[llvm] 58187fa - [CostModel][X86] Update masked load/store/gather/scatter tests to explicitly use variable masks

Fri Apr 5 03:16:05 PDT 2024

Author: Simon Pilgrim
Date: 2024-04-05T11:15:46+01:00
New Revision: 58187fad93bca76d1cce43a5caa70728282d2ff8

URL: https://github.com/llvm/llvm-project/commit/58187fad93bca76d1cce43a5caa70728282d2ff8
DIFF: https://github.com/llvm/llvm-project/commit/58187fad93bca76d1cce43a5caa70728282d2ff8.diff

LOG: [CostModel][X86] Update masked load/store/gather/scatter tests to explicitly use variable masks

Using <X x i1> undef masks means they are treated as constants, which underestimates the scalar costs as it assumes that the masks/branches will fold away.

Added: 
    

Modified: 
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
index 55fdaafc24edaa..d97c3d550efda0 100644

--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -8,1454 +8,1508 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=knl      | FileCheck %s --check-prefixes=AVX512,KNL
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skx      | FileCheck %s --check-prefixes=AVX512,SKX
 
-define i32 @masked_load() {
+define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_load'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_load'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_load'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_load'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-
-  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-
-  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-
-  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-
-  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-
-  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+
+  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+
+  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+
+  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+
+  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+
+  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_store() {
+define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_store'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_store'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_store'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_store'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-
-  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+
+  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_gather() {
+define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 380 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_scatter() {
+define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 380 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 
-  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_expandload() {
+define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_expandload'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_expandload'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_expandload'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_compressstore() {
+define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-
-  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+
+  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 
   ret i32 0
 }

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index ad56c28510f531..b0f9f6d32a56fc 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -8,1454 +8,1562 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl      | FileCheck %s --check-prefixes=AVX512,KNL
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx      | FileCheck %s --check-prefixes=AVX512,SKX
 
-define i32 @masked_load() {
+define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_load'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_load'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_load'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_load'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-
-  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-
-  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-
-  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-
-  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-
-  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+
+  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+
+  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+
+  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+
+  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+
+  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_store() {
+define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_store'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_store'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_store'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_store'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-
-  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+
+  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_gather() {
+define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 316 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_scatter() {
+define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 316 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_scatter'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_scatter'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_scatter'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 
-  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_expandload() {
+define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_expandload'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_expandload'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_expandload'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_compressstore() {
+define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-
-  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+
+  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 
   ret i32 0
 }

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index c7e7c46482c4b0..46123e9f605749 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -8,1454 +8,1562 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl      | FileCheck %s --check-prefixes=AVX512,KNL
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx      | FileCheck %s --check-prefixes=AVX512,SKX
 
-define i32 @masked_load() {
+define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_load'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_load'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_load'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_load'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-
-  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-
-  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-
-  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-
-  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-
-  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+
+  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+
+  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+
+  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+
+  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+
+  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_store() {
+define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_store'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_store'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_store'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_store'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-
-  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+
+  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_gather() {
+define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 316 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_scatter() {
+define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 316 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 158 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 210 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_scatter'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_scatter'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_scatter'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 219 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 109 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 
-  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_expandload() {
+define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_expandload'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_expandload'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_expandload'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_compressstore() {
+define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
-  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-
-  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+
+  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 
   ret i32 0
 }

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
index edb05ad71a401b..416005b4f5f46e 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -8,1454 +8,1508 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=knl      | FileCheck %s --check-prefixes=AVX512,KNL
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skx      | FileCheck %s --check-prefixes=AVX512,SKX
 
-define i32 @masked_load() {
+define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_load'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_load'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_load'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_load'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-
-  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-
-  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-
-  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-
-  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-
-  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+
+  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+
+  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+
+  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+
+  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+
+  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_store() {
+define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_store'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_store'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_store'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_store'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-
-  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+
+  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_gather() {
+define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 380 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_scatter() {
+define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 380 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 
-  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_expandload() {
+define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_expandload'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_expandload'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_expandload'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_compressstore() {
+define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-
-  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+
+  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 
   ret i32 0
 }

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
index 3ebd9cc41c09ec..b460df341b442c 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -8,1454 +8,1508 @@
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=knl      | FileCheck %s --check-prefixes=AVX512,KNL
 ; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skx      | FileCheck %s --check-prefixes=AVX512,SKX
 
-define i32 @masked_load() {
+define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_load'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_load'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_load'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_load'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_load'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
-  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
-  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
-
-  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
-  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
-  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
-  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
-  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
-  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
-  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
-  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
-  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
-  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
-
-  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
-  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
-  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-
-  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
-  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
-  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
-  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
-  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
-  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
-  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
-  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
-  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
-
-  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-
-  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef)
+  %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef)
+  %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+
+  %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef)
+  %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef)
+  %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef)
+  %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef)
+  %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef)
+  %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef)
+  %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef)
+  %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef)
+  %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+  %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef)
+
+  %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef)
+  %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef)
+  %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+
+  %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef)
+  %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef)
+  %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef)
+  %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef)
+  %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef)
+  %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef)
+  %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef)
+  %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef)
+  %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+  %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef)
+
+  %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+
+  %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_store() {
+define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_store'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_store'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_store'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_store'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_store'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
-  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
-  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
-  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
-  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
-  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
-  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
-  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
-  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
-  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
-  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
-  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
-
-  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
-
-  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15)
+  call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14)
+  call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13)
+  call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12)
+  call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11)
+  call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10)
+  call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9)
+  call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7)
+  call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6)
+  call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5)
+  call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3)
+  call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1)
+
+  call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4)
+
+  call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_gather() {
+define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_gather'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 380 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_gather'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_gather'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_gather'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_gather'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_gather'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_gather'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_scatter() {
+define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_scatter'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 380 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_scatter'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX-LABEL: 'masked_scatter'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 322 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; KNL-LABEL: 'masked_scatter'
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; KNL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKX-LABEL: 'masked_scatter'
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-; SKX-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 323 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 161 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 ; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
-  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
+  call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1)
 
-  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
-  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2)
 
-  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
-  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+  call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
+  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4)
 
-  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
-  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
-  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
-  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64)
+  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32)
+  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16)
+  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8)
 
   ret i32 0
 }
 
-define i32 @masked_expandload() {
+define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-; AVX-LABEL: 'masked_expandload'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+; AVX1-LABEL: 'masked_expandload'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_expandload'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_expandload'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
-  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
-  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
-  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+  %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+  %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+  %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+  %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
 
-  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
-  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
-  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
-  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+  %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+  %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+  %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+  %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
 
-  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+  %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+  %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+  %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+  %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
 
-  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
-  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
-  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
-  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+  %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+  %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+  %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+  %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
 
-  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
-  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
-  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
-  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+  %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+  %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+  %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+  %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
 
-  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
-  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
-  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+  %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+  %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+  %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+  %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 
   ret i32 0
 }
 
-define i32 @masked_compressstore() {
+define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
-  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
-
-  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
-
-  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
-
-  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
-  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+
+  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+
+  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+
+  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 
   ret i32 0
 }