[clang] 36ba42d - [IR][CodeGen][mlir] Overload pointer argument for compressstore/expandload intrinsics (#200492)

Fri Jun 5 08:04:24 PDT 2026

Author: Nick Sarnie
Date: 2026-06-05T15:04:18Z
New Revision: 36ba42d09737872243f1968067fa835fbaecbe73

URL: https://github.com/llvm/llvm-project/commit/36ba42d09737872243f1968067fa835fbaecbe73
DIFF: https://github.com/llvm/llvm-project/commit/36ba42d09737872243f1968067fa835fbaecbe73.diff

LOG: [IR][CodeGen][mlir] Overload pointer argument for compressstore/expandload intrinsics (#200492)

Update the `llvm.masked.expandload` and `llvm.masked.compressstore`
intrinsics to have overloaded pointer arguments so other address spaces
can be used. This puts these intrinsics in line with other similar
intrinsics, which already have overloaded pointer arguments.

The change may look large, but almost all of the changes are adding
`.p0` to tests.

This is needed for targets that have a non-zero default address space. The
problem was found when trying to compile the libc LIT tests for SPIRV,
specifically
[this](https://github.com/llvm/llvm-project/blob/4ac26f45fa8f5c58a90effb903808cb0e908cf1c/libc/test/src/__support/CPP/simd_test.cpp)
one.

Co-Authored-By: Claude Sonnet 4.5 <noreply at anthropic.com>

---------

Signed-off-by: Nick Sarnie <nick.sarnie at intel.com>
Co-authored-by: Claude Sonnet 4.5 <noreply at anthropic.com>

Added: 
    clang/test/CodeGen/builtin-masked-addrspace.c
    llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-compressstore.ll
    llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-expandload.ll

Modified: 
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/CodeGen/TargetBuiltins/X86.cpp
    clang/test/CodeGen/X86/avx512f-builtins.c
    clang/test/CodeGen/X86/avx512vbmi2-builtins.c
    clang/test/CodeGen/X86/avx512vl-builtins.c
    clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c
    clang/test/CodeGen/builtin-masked.c
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/IR/AutoUpgrade.cpp
    llvm/lib/IR/IRBuilder.cpp
    llvm/test/Analysis/CostModel/AArch64/masked_expand_load.ll
    llvm/test/Analysis/CostModel/PowerPC/ld-st-with-length.ll
    llvm/test/Analysis/CostModel/RISCV/gep.ll
    llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
    llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
    llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll
    llvm/test/Bitcode/upgrade-masked-keep-metadata.ll
    llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
    llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
    llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
    llvm/test/Other/force-opaque-ptrs.ll
    llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
    mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
    mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7ce38c5d1922c..0cb5f95049789 100644

--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4805,14 +4805,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         E->getType()->getAs<VectorType>()->getElementType(), nullptr);
 
     llvm::Value *Result;
-    if (BuiltinID == Builtin::BI__builtin_masked_load) {
+    if (BuiltinID == Builtin::BI__builtin_masked_load)
       Result = Builder.CreateMaskedLoad(RetTy, Ptr, Align.getAsAlign(), Mask,
                                         PassThru, "masked_load");
-    } else {
-      Function *F = CGM.getIntrinsic(Intrinsic::masked_expandload, {RetTy});
-      Result =
-          Builder.CreateCall(F, {Ptr, Mask, PassThru}, "masked_expand_load");
-    }
+    else
+      Result = Builder.CreateMaskedExpandLoad(RetTy, Ptr, MaybeAlign(), Mask,
+                                              PassThru, "masked_expand_load");
+
     return RValue::get(Result);
   };
   case Builtin::BI__builtin_masked_gather: {
@@ -4842,20 +4841,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Value *Val = EmitScalarExpr(E->getArg(1));
     llvm::Value *Ptr = EmitScalarExpr(E->getArg(2));
 
-    QualType ValTy = E->getArg(1)->getType();
-    llvm::Type *ValLLTy = CGM.getTypes().ConvertType(ValTy);
-
     CharUnits Align = CGM.getNaturalTypeAlignment(
         E->getArg(1)->getType()->getAs<VectorType>()->getElementType(),
         nullptr);
 
-    if (BuiltinID == Builtin::BI__builtin_masked_store) {
+    if (BuiltinID == Builtin::BI__builtin_masked_store)
       Builder.CreateMaskedStore(Val, Ptr, Align.getAsAlign(), Mask);
-    } else {
-      llvm::Function *F =
-          CGM.getIntrinsic(llvm::Intrinsic::masked_compressstore, {ValLLTy});
-      Builder.CreateCall(F, {Val, Ptr, Mask});
-    }
+    else
+      Builder.CreateMaskedCompressStore(Val, Ptr, MaybeAlign(), Mask);
+
     return RValue::get(nullptr);
   }
   case Builtin::BI__builtin_masked_scatter: {

diff  --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index ee1ccd83e3aa2..acfeb9967cd2f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -161,9 +161,8 @@ static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
   Value *MaskVec = getMaskVecValue(
       CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
 
-  llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
-                                           ResultTy);
-  return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
+  return CGF.Builder.CreateMaskedExpandLoad(ResultTy, Ptr, MaybeAlign(),
+                                            MaskVec, Ops[1]);
 }
 
 static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
@@ -186,9 +185,8 @@ static Value *EmitX86CompressStore(CodeGenFunction &CGF,
 
   Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
 
-  llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
-                                           ResultTy);
-  return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
+  return CGF.Builder.CreateMaskedCompressStore(Ops[1], Ptr, MaybeAlign(),
+                                               MaskVec);
 }
 
 static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,

diff  --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 249a917a00461..50b44df032f81 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -9946,49 +9946,49 @@ TEST_CONSTEXPR(match_v8di(_mm512_maskz_expand_epi64(0x30, (__m512i)(__v8di){ 1,
 
 __m512i test_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_mask_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v8i64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8i64.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_mask_expandloadu_epi64(__W, __U, __P); 
 }
 
 __m512i test_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_maskz_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v8i64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8i64.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_maskz_expandloadu_epi64(__U, __P); 
 }
 
 __m512d test_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_mask_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v8f64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8f64.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_mask_expandloadu_pd(__W, __U, __P); 
 }
 
 __m512d test_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_maskz_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v8f64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8f64.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_maskz_expandloadu_pd(__U, __P); 
 }
 
 __m512i test_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_mask_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v16i32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v16i32.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_mask_expandloadu_epi32(__W, __U, __P); 
 }
 
 __m512i test_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_maskz_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v16i32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v16i32.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_maskz_expandloadu_epi32(__U, __P); 
 }
 
 __m512 test_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_mask_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v16f32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v16f32.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_mask_expandloadu_ps(__W, __U, __P); 
 }
 
 __m512 test_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) {
   // CHECK-LABEL: test_mm512_maskz_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v16f32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v16f32.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_maskz_expandloadu_ps(__U, __P); 
 }
 
@@ -10114,25 +10114,25 @@ TEST_CONSTEXPR(match_m512(_mm512_maskz_mov_ps(0xF3F3, (__m512){+1.0f, +2.0f, +3.
 
 void test_mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) {
   // CHECK-LABEL: test_mm512_mask_compressstoreu_pd
-  // CHECK: @llvm.masked.compressstore.v8f64(<8 x double> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v8f64.p0(<8 x double> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
   return _mm512_mask_compressstoreu_pd(__P, __U, __A); 
 }
 
 void test_mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_mask_compressstoreu_epi64
-  // CHECK: @llvm.masked.compressstore.v8i64(<8 x i64> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v8i64.p0(<8 x i64> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
   return _mm512_mask_compressstoreu_epi64(__P, __U, __A); 
 }
 
 void test_mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, __m512 __A) {
   // CHECK-LABEL: test_mm512_mask_compressstoreu_ps
-  // CHECK: @llvm.masked.compressstore.v16f32(<16 x float> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v16f32.p0(<16 x float> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
   return _mm512_mask_compressstoreu_ps(__P, __U, __A); 
 }
 
 void test_mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_mask_compressstoreu_epi32
-  // CHECK: @llvm.masked.compressstore.v16i32(<16 x i32> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v16i32.p0(<16 x i32> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
   return _mm512_mask_compressstoreu_epi32(__P, __U, __A); 
 }
 

diff  --git a/clang/test/CodeGen/X86/avx512vbmi2-builtins.c b/clang/test/CodeGen/X86/avx512vbmi2-builtins.c
index 1e71285b29b4e..c1df727a09400 100644
--- a/clang/test/CodeGen/X86/avx512vbmi2-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vbmi2-builtins.c
@@ -41,13 +41,13 @@ TEST_CONSTEXPR(match_v64qi(_mm512_maskz_compress_epi8(0x8000000000000003ULL, (__
 
 void test_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D) {
   // CHECK-LABEL: test_mm512_mask_compressstoreu_epi16
-  // CHECK: call void @llvm.masked.compressstore.v32i16(<32 x i16> %{{.*}}, ptr %{{.*}}, <32 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> %{{.*}}, ptr %{{.*}}, <32 x i1> %{{.*}})
   _mm512_mask_compressstoreu_epi16(__P, __U, __D);
 }
 
 void test_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D) {
   // CHECK-LABEL: test_mm512_mask_compressstoreu_epi8
-  // CHECK: call void @llvm.masked.compressstore.v64i8(<64 x i8> %{{.*}}, ptr %{{.*}}, <64 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> %{{.*}}, ptr %{{.*}}, <64 x i1> %{{.*}})
   _mm512_mask_compressstoreu_epi8(__P, __U, __D);
 }
 
@@ -81,25 +81,25 @@ TEST_CONSTEXPR(match_v64qi(_mm512_maskz_expand_epi8(0x416B5E0F4234A3D5, (__m512i
 
 __m512i test_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const* __P) {
   // CHECK-LABEL: test_mm512_mask_expandloadu_epi16
-  // CHECK: call <32 x i16> @llvm.masked.expandload.v32i16(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  // CHECK: call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_mask_expandloadu_epi16(__S, __U, __P);
 }
 
 __m512i test_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const* __P) {
   // CHECK-LABEL: test_mm512_maskz_expandloadu_epi16
-  // CHECK: call <32 x i16> @llvm.masked.expandload.v32i16(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  // CHECK: call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_maskz_expandloadu_epi16(__U, __P);
 }
 
 __m512i test_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const* __P) {
   // CHECK-LABEL: test_mm512_mask_expandloadu_epi8
-  // CHECK: call <64 x i8> @llvm.masked.expandload.v64i8(ptr %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  // CHECK: call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_mask_expandloadu_epi8(__S, __U, __P);
 }
 
 __m512i test_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const* __P) {
   // CHECK-LABEL: test_mm512_maskz_expandloadu_epi8
-  // CHECK: call <64 x i8> @llvm.masked.expandload.v64i8(ptr %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  // CHECK: call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
   return _mm512_maskz_expandloadu_epi8(__U, __P);
 }
 

diff  --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 7d42943d68ca8..320a8b7c2cf4c 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -3877,42 +3877,42 @@ TEST_CONSTEXPR(match_v8si(_mm256_maskz_compress_epi32(0xA5, (__m256i)(__v8si){0,
 
 void test_mm_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m128d __A) {
   // CHECK-LABEL: test_mm_mask_compressstoreu_pd
-  // CHECK: @llvm.masked.compressstore.v2f64(<2 x double> %{{.*}}, ptr %{{.*}}, <2 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v2f64.p0(<2 x double> %{{.*}}, ptr %{{.*}}, <2 x i1> %{{.*}})
   return _mm_mask_compressstoreu_pd(__P,__U,__A); 
 }
 void test_mm256_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m256d __A) {
   // CHECK-LABEL: test_mm256_mask_compressstoreu_pd
-  // CHECK: @llvm.masked.compressstore.v4f64(<4 x double> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v4f64.p0(<4 x double> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
   return _mm256_mask_compressstoreu_pd(__P,__U,__A); 
 }
 void test_mm_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_compressstoreu_epi64
-  // CHECK: @llvm.masked.compressstore.v2i64(<2 x i64> %{{.*}}, ptr %{{.*}}, <2 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v2i64.p0(<2 x i64> %{{.*}}, ptr %{{.*}}, <2 x i1> %{{.*}})
   return _mm_mask_compressstoreu_epi64(__P,__U,__A); 
 }
 void test_mm256_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m256i __A) {
   // CHECK-LABEL: test_mm256_mask_compressstoreu_epi64
-  // CHECK: @llvm.masked.compressstore.v4i64(<4 x i64> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v4i64.p0(<4 x i64> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
   return _mm256_mask_compressstoreu_epi64(__P,__U,__A); 
 }
 void test_mm_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m128 __A) {
   // CHECK-LABEL: test_mm_mask_compressstoreu_ps
-  // CHECK: @llvm.masked.compressstore.v4f32(<4 x float> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v4f32.p0(<4 x float> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
   return _mm_mask_compressstoreu_ps(__P,__U,__A); 
 }
 void test_mm256_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m256 __A) {
   // CHECK-LABEL: test_mm256_mask_compressstoreu_ps
-  // CHECK: @llvm.masked.compressstore.v8f32(<8 x float> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v8f32.p0(<8 x float> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
   return _mm256_mask_compressstoreu_ps(__P,__U,__A); 
 }
 void test_mm_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_compressstoreu_epi32
-  // CHECK: @llvm.masked.compressstore.v4i32(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v4i32.p0(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i1> %{{.*}})
   return _mm_mask_compressstoreu_epi32(__P,__U,__A); 
 }
 void test_mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m256i __A) {
   // CHECK-LABEL: test_mm256_mask_compressstoreu_epi32
-  // CHECK: @llvm.masked.compressstore.v8i32(<8 x i32> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
+  // CHECK: @llvm.masked.compressstore.v8i32.p0(<8 x i32> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
   return _mm256_mask_compressstoreu_epi32(__P,__U,__A); 
 }
 __m128d test_mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) {
@@ -4484,82 +4484,82 @@ TEST_CONSTEXPR(match_v4di(_mm256_maskz_expand_epi64(0xB, (__m256i)(__v4di){ 1, 2
 
 __m128d test_mm_mask_expandloadu_pd(__m128d __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_mask_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v2f64(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v2f64.p0(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
   return _mm_mask_expandloadu_pd(__W,__U,__P); 
 }
 __m128d test_mm_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_maskz_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v2f64(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v2f64.p0(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
   return _mm_maskz_expandloadu_pd(__U,__P); 
 }
 __m256d test_mm256_mask_expandloadu_pd(__m256d __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm256_mask_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v4f64(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4f64.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_mask_expandloadu_pd(__W,__U,__P); 
 }
 __m256d test_mm256_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm256_maskz_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v4f64(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4f64.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_maskz_expandloadu_pd(__U,__P); 
 }
 __m128i test_mm_mask_expandloadu_epi64(__m128i __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_mask_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v2i64(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v2i64.p0(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_mask_expandloadu_epi64(__W,__U,__P); 
 }
 __m128i test_mm_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_maskz_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v2i64(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v2i64.p0(ptr %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maskz_expandloadu_epi64(__U,__P); 
 }
 __m256i test_mm256_mask_expandloadu_epi64(__m256i __W, __mmask8 __U,   void const *__P) {
   // CHECK-LABEL: test_mm256_mask_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v4i64(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4i64.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_mask_expandloadu_epi64(__W,__U,__P); 
 }
 __m256i test_mm256_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm256_maskz_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v4i64(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4i64.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_maskz_expandloadu_epi64(__U,__P); 
 }
 __m128 test_mm_mask_expandloadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_mask_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v4f32(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4f32.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
   return _mm_mask_expandloadu_ps(__W,__U,__P); 
 }
 __m128 test_mm_maskz_expandloadu_ps(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_maskz_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v4f32(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4f32.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
   return _mm_maskz_expandloadu_ps(__U,__P); 
 }
 __m256 test_mm256_mask_expandloadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm256_mask_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v8f32(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8f32.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_mask_expandloadu_ps(__W,__U,__P); 
 }
 __m256 test_mm256_maskz_expandloadu_ps(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm256_maskz_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v8f32(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8f32.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_maskz_expandloadu_ps(__U,__P); 
 }
 __m128i test_mm_mask_expandloadu_epi32(__m128i __W, __mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_mask_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v4i32(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4i32.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_mask_expandloadu_epi32(__W,__U,__P); 
 }
 __m128i test_mm_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm_maskz_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v4i32(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v4i32.p0(ptr %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maskz_expandloadu_epi32(__U,__P); 
 }
 __m256i test_mm256_mask_expandloadu_epi32(__m256i __W, __mmask8 __U,   void const *__P) {
   // CHECK-LABEL: test_mm256_mask_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v8i32(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8i32.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_mask_expandloadu_epi32(__W,__U,__P); 
 }
 __m256i test_mm256_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) {
   // CHECK-LABEL: test_mm256_maskz_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v8i32(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: @llvm.masked.expandload.v8i32.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_maskz_expandloadu_epi32(__U,__P); 
 }
 __m128 test_mm_mask_expand_ps(__m128 __W, __mmask8 __U, __m128 __A) {

diff  --git a/clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c b/clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c
index 94b16a9bd3541..e62546d1a4634 100644
--- a/clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvbmi2-builtins.c
@@ -41,13 +41,13 @@ TEST_CONSTEXPR(match_v16qi(_mm_maskz_compress_epi8(0x8003, (__m128i)(__v16qi){0,
 
 void test_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) {
   // CHECK-LABEL: test_mm_mask_compressstoreu_epi16
-  // CHECK: call void @llvm.masked.compressstore.v8i16(<8 x i16> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
   _mm_mask_compressstoreu_epi16(__P, __U, __D);
 }
 
 void test_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) {
   // CHECK-LABEL: test_mm_mask_compressstoreu_epi8
-  // CHECK: call void @llvm.masked.compressstore.v16i8(<16 x i8> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
   _mm_mask_compressstoreu_epi8(__P, __U, __D);
 }
 
@@ -81,25 +81,25 @@ TEST_CONSTEXPR(match_v16qi(_mm_maskz_expand_epi8(0x4A9C, (__m128i)(__v16qi){ 1,
 
 __m128i test_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const* __P) {
   // CHECK-LABEL: test_mm_mask_expandloadu_epi16
-  // CHECK: call <8 x i16> @llvm.masked.expandload.v8i16(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_mask_expandloadu_epi16(__S, __U, __P);
 }
 
 __m128i test_mm_maskz_expandloadu_epi16(__mmask8 __U, void const* __P) {
   // CHECK-LABEL: test_mm_maskz_expandloadu_epi16
-  // CHECK: call <8 x i16> @llvm.masked.expandload.v8i16(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_maskz_expandloadu_epi16(__U, __P);
 }
 
 __m128i test_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const* __P) {
   // CHECK-LABEL: test_mm_mask_expandloadu_epi8
-  // CHECK: call <16 x i8> @llvm.masked.expandload.v16i8(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_mask_expandloadu_epi8(__S, __U, __P);
 }
 
 __m128i test_mm_maskz_expandloadu_epi8(__mmask16 __U, void const* __P) {
   // CHECK-LABEL: test_mm_maskz_expandloadu_epi8
-  // CHECK: call <16 x i8> @llvm.masked.expandload.v16i8(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_maskz_expandloadu_epi8(__U, __P);
 }
 
@@ -133,13 +133,13 @@ TEST_CONSTEXPR(match_v32qi(_mm256_maskz_compress_epi8(0x80000003, (__m256i)(__v3
 
 void test_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) {
   // CHECK-LABEL: test_mm256_mask_compressstoreu_epi16
-  // CHECK: call void @llvm.masked.compressstore.v16i16(<16 x i16> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
   _mm256_mask_compressstoreu_epi16(__P, __U, __D);
 }
 
 void test_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) {
   // CHECK-LABEL: test_mm256_mask_compressstoreu_epi8
-  // CHECK: call void @llvm.masked.compressstore.v32i8(<32 x i8> %{{.*}}, ptr %{{.*}}, <32 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> %{{.*}}, ptr %{{.*}}, <32 x i1> %{{.*}})
   _mm256_mask_compressstoreu_epi8(__P, __U, __D);
 }
 
@@ -173,25 +173,25 @@ TEST_CONSTEXPR(match_v32qi(_mm256_maskz_expand_epi8(0x134DA768, (__m256i)(__v32q
 
 __m256i test_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const* __P) {
   // CHECK-LABEL: test_mm256_mask_expandloadu_epi16
-  // CHECK: call <16 x i16> @llvm.masked.expandload.v16i16(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
+  // CHECK: call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_mask_expandloadu_epi16(__S, __U, __P);
 }
 
 __m256i test_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const* __P) {
   // CHECK-LABEL: test_mm256_maskz_expandloadu_epi16
-  // CHECK: call <16 x i16> @llvm.masked.expandload.v16i16(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
+  // CHECK: call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_maskz_expandloadu_epi16(__U, __P);
 }
 
 __m256i test_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const* __P) {
   // CHECK-LABEL: test_mm256_mask_expandloadu_epi8
-  // CHECK: call <32 x  i8> @llvm.masked.expandload.v32i8(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
+  // CHECK: call <32 x  i8> @llvm.masked.expandload.v32i8.p0(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_mask_expandloadu_epi8(__S, __U, __P);
 }
 
 __m256i test_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const* __P) {
   // CHECK-LABEL: test_mm256_maskz_expandloadu_epi8
-  // CHECK: call <32 x i8> @llvm.masked.expandload.v32i8(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
+  // CHECK: call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_maskz_expandloadu_epi8(__U, __P);
 }
 

diff  --git a/clang/test/CodeGen/builtin-masked-addrspace.c b/clang/test/CodeGen/builtin-masked-addrspace.c
new file mode 100644
index 0000000000000..5a40fb74fa7e6
--- /dev/null
+++ b/clang/test/CodeGen/builtin-masked-addrspace.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple spirv64-intel -emit-llvm -o - %s | FileCheck %s
+
+// Test we can call the intrinsics with non-zero AS pointers.
+
+typedef int v8i __attribute__((ext_vector_type(8)));
+typedef _Bool v8b __attribute__((ext_vector_type(8)));
+
+// CHECK-LABEL: define spir_func <8 x i32> @test_load_expand(
+// CHECK: call addrspace(9) <8 x i32> @llvm.masked.expandload.v8i32.p4(ptr addrspace(4) %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+v8i test_load_expand(v8b m, int *p, v8i t) {
+  return __builtin_masked_expand_load(m, p, t);
+}
+
+// CHECK-LABEL: define spir_func void @test_compress_store(
+// CHECK: call addrspace(9) void @llvm.masked.compressstore.v8i32.p4(<8 x i32> %{{.*}}, ptr addrspace(4) %{{.*}}, <8 x i1> %{{.*}})
+void test_compress_store(v8b m, v8i v, int *p) {
+  __builtin_masked_compress_store(m, v, p);
+}

diff  --git a/clang/test/CodeGen/builtin-masked.c b/clang/test/CodeGen/builtin-masked.c
index f5a4b7511491c..c1ae989d50411 100644
--- a/clang/test/CodeGen/builtin-masked.c
+++ b/clang/test/CodeGen/builtin-masked.c
@@ -72,7 +72,7 @@ v8i test_load_passthru(v8b m, int *p, v8i t) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr [[T_ADDR]], align 32
-// CHECK-NEXT:    [[MASKED_EXPAND_LOAD:%.*]] = call <8 x i32> @llvm.masked.expandload.v8i32(ptr [[TMP3]], <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
+// CHECK-NEXT:    [[MASKED_EXPAND_LOAD:%.*]] = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr [[TMP3]], <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
 // CHECK-NEXT:    ret <8 x i32> [[MASKED_EXPAND_LOAD]]
 //
 v8i test_load_expand(v8b m, int *p, v8i t) {
@@ -150,7 +150,7 @@ void gtest_store(v8b m, gv8i v, int *p) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
 // CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.masked.compressstore.v8i32(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 void test_compress_store(v8b m, v8i v, int *p) {
@@ -176,7 +176,7 @@ void test_compress_store(v8b m, v8i v, int *p) {
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
 // CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.masked.compressstore.v8i32(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], <8 x i1> [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 void gtest_compress_store(v8b m, gv8i v, int *p) {

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 0f9859d730182..611f1b6bb19e0 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2669,13 +2669,13 @@ def int_masked_scatter:
 
 def int_masked_expandload:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-            [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+            [llvm_anyptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
              LLVMMatchType<0>],
             [IntrReadMem, NoCapture<ArgIndex<0>>]>;
 
 def int_masked_compressstore:
   DefaultAttrsIntrinsic<[],
-            [llvm_anyvector_ty, llvm_ptr_ty,
+            [llvm_anyvector_ty, llvm_anyptr_ty,
              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
             [IntrWriteMem, IntrArgMemOnly,
              NoCapture<ArgIndex<1>>]>;

diff  --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 44076e0ce2442..0770f0f0ff060 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -3307,20 +3307,20 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
                             CI->getArgOperand(2), Aligned);
   } else if (Name.starts_with("avx512.mask.expand.load.")) {
     auto *ResultTy = cast<FixedVectorType>(CI->getType());
+    auto *PtrTy = CI->getOperand(0)->getType();
     Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2),
                                    ResultTy->getNumElements());
-
     Rep = Builder.CreateIntrinsic(
-        Intrinsic::masked_expandload, ResultTy,
+        Intrinsic::masked_expandload, {ResultTy, PtrTy},
         {CI->getOperand(0), MaskVec, CI->getOperand(1)});
   } else if (Name.starts_with("avx512.mask.compress.store.")) {
     auto *ResultTy = cast<VectorType>(CI->getArgOperand(1)->getType());
+    auto *PtrTy = CI->getArgOperand(0)->getType();
     Value *MaskVec =
         getX86MaskVec(Builder, CI->getArgOperand(2),
                       cast<FixedVectorType>(ResultTy)->getNumElements());
-
     Rep = Builder.CreateIntrinsic(
-        Intrinsic::masked_compressstore, ResultTy,
+        Intrinsic::masked_compressstore, {ResultTy, PtrTy},
         {CI->getArgOperand(1), CI->getArgOperand(0), MaskVec});
   } else if (Name.starts_with("avx512.mask.compress.") ||
              Name.starts_with("avx512.mask.expand.")) {

diff  --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index a52c3db9dad97..876d642b98fb0 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -697,7 +697,8 @@ CallInst *IRBuilderBase::CreateMaskedExpandLoad(Type *Ty, Value *Ptr,
   assert(Mask && "Mask should not be all-ones (null)");
   if (!PassThru)
     PassThru = PoisonValue::get(Ty);
-  Type *OverloadedTypes[] = {Ty};
+  Type *PtrTy = Ptr->getType();
+  Type *OverloadedTypes[] = {Ty, PtrTy};
   Value *Ops[] = {Ptr, Mask, PassThru};
   CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_expandload, Ops,
                                        OverloadedTypes, Name);
@@ -718,7 +719,8 @@ CallInst *IRBuilderBase::CreateMaskedCompressStore(Value *Val, Value *Ptr,
   Type *DataTy = Val->getType();
   assert(DataTy->isVectorTy() && "Val should be a vector");
   assert(Mask && "Mask should not be all-ones (null)");
-  Type *OverloadedTypes[] = {DataTy};
+  Type *PtrTy = Ptr->getType();
+  Type *OverloadedTypes[] = {DataTy, PtrTy};
   Value *Ops[] = {Val, Ptr, Mask};
   CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_compressstore, Ops,
                                        OverloadedTypes);

diff  --git a/llvm/test/Analysis/CostModel/AArch64/masked_expand_load.ll b/llvm/test/Analysis/CostModel/AArch64/masked_expand_load.ll
index b33f73a1fef58..342b4b31abbd3 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_expand_load.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_expand_load.ll
@@ -8,274 +8,274 @@
 
 define void @fixed() {
 ; SVE2p2-SME2p2-NON-STREAMING-LABEL: 'fixed'
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:48 Lat:128 SizeLat:80 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16(ptr poison, <2 x i1> poison, <2 x i16> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr poison, <4 x i1> poison, <4 x i16> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr poison, <8 x i1> poison, <8 x i16> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr poison, <2 x i1> poison, <2 x i32> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr poison, <4 x i1> poison, <4 x i32> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16(ptr poison, <2 x i1> poison, <2 x half> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16(ptr poison, <4 x i1> poison, <4 x half> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:23 Lat:62 SizeLat:38 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16(ptr poison, <8 x i1> poison, <8 x half> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr poison, <2 x i1> poison, <2 x float> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr poison, <4 x i1> poison, <4 x float> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr poison, <2 x i1> poison, <2 x double> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:92 Lat:248 SizeLat:152 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16(ptr poison, <32 x i1> poison, <32 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:48 Lat:128 SizeLat:80 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:23 Lat:62 SizeLat:38 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:92 Lat:248 SizeLat:152 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
 ; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE2p2-SME2p2-STREAMING-LABEL: 'fixed'
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16(ptr poison, <2 x i1> poison, <2 x i16> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr poison, <4 x i1> poison, <4 x i16> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr poison, <8 x i1> poison, <8 x i16> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr poison, <2 x i1> poison, <2 x i32> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr poison, <4 x i1> poison, <4 x i32> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16(ptr poison, <2 x i1> poison, <2 x half> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16(ptr poison, <4 x i1> poison, <4 x half> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16(ptr poison, <8 x i1> poison, <8 x half> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr poison, <2 x i1> poison, <2 x float> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr poison, <4 x i1> poison, <4 x float> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr poison, <2 x i1> poison, <2 x double> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 8 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 16 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16(ptr poison, <32 x i1> poison, <32 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 8 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 16 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
 ; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE2p2-SME2p2-LABEL: 'fixed'
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:48 Lat:128 SizeLat:80 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16(ptr poison, <2 x i1> poison, <2 x i16> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr poison, <4 x i1> poison, <4 x i16> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr poison, <8 x i1> poison, <8 x i16> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr poison, <2 x i1> poison, <2 x i32> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr poison, <4 x i1> poison, <4 x i32> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16(ptr poison, <2 x i1> poison, <2 x half> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16(ptr poison, <4 x i1> poison, <4 x half> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:23 Lat:62 SizeLat:38 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16(ptr poison, <8 x i1> poison, <8 x half> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr poison, <2 x i1> poison, <2 x float> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr poison, <4 x i1> poison, <4 x float> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr poison, <2 x i1> poison, <2 x double> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:92 Lat:248 SizeLat:152 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16(ptr poison, <32 x i1> poison, <32 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:48 Lat:128 SizeLat:80 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:24 Lat:64 SizeLat:40 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:6 Lat:16 SizeLat:10 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:23 Lat:62 SizeLat:38 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:11 Lat:30 SizeLat:18 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:5 Lat:14 SizeLat:8 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:12 Lat:32 SizeLat:20 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:152 CodeSize:92 Lat:248 SizeLat:152 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
 ; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE-ONLY-LABEL: 'fixed'
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16(ptr poison, <2 x i1> poison, <2 x i16> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr poison, <4 x i1> poison, <4 x i16> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr poison, <8 x i1> poison, <8 x i16> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr poison, <2 x i1> poison, <2 x i32> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr poison, <4 x i1> poison, <4 x i32> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16(ptr poison, <2 x i1> poison, <2 x half> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16(ptr poison, <4 x i1> poison, <4 x half> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16(ptr poison, <8 x i1> poison, <8 x half> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr poison, <2 x i1> poison, <2 x float> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr poison, <4 x i1> poison, <4 x float> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr poison, <2 x i1> poison, <2 x double> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16(ptr poison, <32 x i1> poison, <32 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
 ; SVE-ONLY-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE2p2-SME2p2-SVE256-LABEL: 'fixed'
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16(ptr poison, <2 x i1> poison, <2 x i16> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr poison, <4 x i1> poison, <4 x i16> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr poison, <8 x i1> poison, <8 x i16> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr poison, <2 x i1> poison, <2 x i32> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr poison, <4 x i1> poison, <4 x i32> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16(ptr poison, <2 x i1> poison, <2 x half> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16(ptr poison, <4 x i1> poison, <4 x half> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16(ptr poison, <8 x i1> poison, <8 x half> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr poison, <2 x i1> poison, <2 x float> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr poison, <4 x i1> poison, <4 x float> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr poison, <2 x i1> poison, <2 x double> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 8 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16(ptr poison, <32 x i1> poison, <32 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 8 for: %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
 ; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 entry:
   ; Legal fixed-width integer types
-  %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
-  %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
-  %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
-  %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
-  %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
-  %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
-  %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
-  %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
-  %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
-  %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+  %v2i8 = call <2 x i8> @llvm.masked.expandload.v2i8.p0.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+  %v4i8 = call <4 x i8> @llvm.masked.expandload.v4i8.p0.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+  %v8i8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+  %v16i8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+  %v2i16 = call <2 x i16> @llvm.masked.expandload.v2i16.p0.p0(ptr poison, <2 x i1> poison, <2 x i16> poison)
+  %v4i16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0.p0(ptr poison, <4 x i1> poison, <4 x i16> poison)
+  %v8i16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0.p0(ptr poison, <8 x i1> poison, <8 x i16> poison)
+  %v2i32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0.p0(ptr poison, <2 x i1> poison, <2 x i32> poison)
+  %v4i32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0.p0(ptr poison, <4 x i1> poison, <4 x i32> poison)
+  %v2i64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
 
   ; Legal fixed-width floating point types
-  %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
-  %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
-  %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
-  %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
-  %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
-  %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
+  %v2f16 = call <2 x half> @llvm.masked.expandload.v2f16.p0.p0(ptr poison, <2 x i1> poison, <2 x half> poison)
+  %v4f16 = call <4 x half> @llvm.masked.expandload.v4f16.p0.p0(ptr poison, <4 x i1> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.masked.expandload.v8f16.p0.p0(ptr poison, <8 x i1> poison, <8 x half> poison)
+  %v2f32 = call <2 x float> @llvm.masked.expandload.v2f32.p0.p0(ptr poison, <2 x i1> poison, <2 x float> poison)
+  %v4f32 = call <4 x float> @llvm.masked.expandload.v4f32.p0.p0(ptr poison, <4 x i1> poison, <4 x float> poison)
+  %v2f64 = call <2 x double> @llvm.masked.expandload.v2f64.p0.p0(ptr poison, <2 x i1> poison, <2 x double> poison)
 
   ; Examples of illegal fixed-width types
-  %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
-  %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
+  %v4i64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+  %v32f16 = call <32 x half> @llvm.masked.expandload.v32f16.p0.p0(ptr poison, <32 x i1> poison, <32 x half> poison)
 
   ret void
 }
 
 define void @scalable() {
 ; SVE2p2-SME2p2-NON-STREAMING-LABEL: 'scalable'
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
 ; SVE2p2-SME2p2-NON-STREAMING-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE2p2-SME2p2-STREAMING-LABEL: 'scalable'
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
 ; SVE2p2-SME2p2-STREAMING-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE2p2-SME2p2-LABEL: 'scalable'
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
 ; SVE2p2-SME2p2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE-ONLY-LABEL: 'scalable'
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; SVE-ONLY-NEXT:  Cost Model: Found costs of Invalid for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
 ; SVE-ONLY-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE2p2-SME2p2-SVE256-LABEL: 'scalable'
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 2 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 8 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of 16 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
 ; SVE2p2-SME2p2-SVE256-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 entry:
   ; Legal scalable integer types
-  %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-  %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-  %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-  %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-  %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
-  %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
-  %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
-  %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
-  %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
-  %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+  %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+  %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+  %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+  %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+  %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.expandload.nxv2i16.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i16> poison)
+  %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.expandload.nxv4i16.p0.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i16> poison)
+  %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.expandload.nxv8i16.p0.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i16> poison)
+  %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.expandload.nxv2i32.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i32> poison)
+  %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.expandload.nxv4i32.p0.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i32> poison)
+  %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
 
   ; Legal scalable floating point types
-  %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
-  %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
-  %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
-  %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
-  %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
-  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
+  %nxv2f16 = call <vscale x 2 x half> @llvm.masked.expandload.nxv2f16.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x half> poison)
+  %nxv4f16 = call <vscale x 4 x half> @llvm.masked.expandload.nxv4f16.p0.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x half> poison)
+  %nxv8f16 = call <vscale x 8 x half> @llvm.masked.expandload.nxv8f16.p0.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x half> poison)
+  %nxv2f32 = call <vscale x 2 x float> @llvm.masked.expandload.nxv2f32.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x float> poison)
+  %nxv4f32 = call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x float> poison)
+  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.expandload.nxv2f64.p0.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x double> poison)
 
   ; Examples of illegal scalable types
-  %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
-  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
+  %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.expandload.nxv1i64.p0.p0(ptr poison, <vscale x 1 x i1> poison, <vscale x 1 x i64> poison)
+  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.expandload.nxv32f16.p0.p0(ptr poison, <vscale x 32 x i1> poison, <vscale x 32 x half> poison)
 
   ret void
 }

diff  --git a/llvm/test/Analysis/CostModel/PowerPC/ld-st-with-length.ll b/llvm/test/Analysis/CostModel/PowerPC/ld-st-with-length.ll
index 0f854781c02ae..870d2cb04ac10 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/ld-st-with-length.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/ld-st-with-length.ll
@@ -35,8 +35,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; P9-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; P9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; P9-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; P9-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; P9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; P9-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; P9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; P9-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P9-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P9-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -66,8 +66,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; P9BE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; P9BE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; P9BE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; P9BE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; P9BE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; P9BE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; P9BE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; P9BE-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P9BE-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P9BE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -97,8 +97,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; P932-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; P932-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; P932-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; P932-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; P932-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; P932-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; P932-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; P932-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P932-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P932-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -128,8 +128,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; P10-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; P10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; P10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; P10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; P10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; P10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; P10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; P10-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P10-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P10-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -159,8 +159,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; P10BE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; P10BE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; P10BE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; P10BE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; P10BE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; P10BE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; P10BE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; P10BE-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P10BE-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P10BE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -190,8 +190,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; P1032-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; P1032-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; P1032-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; P1032-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; P1032-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; P1032-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; P1032-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; P1032-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P1032-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; P1032-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -221,8 +221,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; FUTURE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; FUTURE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; FUTURE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; FUTURE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; FUTURE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; FUTURE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; FUTURE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; FUTURE-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; FUTURE-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; FUTURE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -252,8 +252,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; FUTUREBE-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; FUTUREBE-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; FUTUREBE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -283,8 +283,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
 ; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> splat (i1 true), i32 1)
 ; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> splat (i1 true), i32 1)
 ; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> splat (i1 true), i32 1)
-; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
-; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
+; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> splat (i1 true), <2 x i8> zeroinitializer)
+; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> splat (i1 true))
 ; FUTURE32-NEXT:  Cost Model: Invalid cost for instruction: %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; FUTURE32-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> splat (i1 true), i32 1)
 ; FUTURE32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@@ -313,8 +313,8 @@ define void @bar(ptr %base, <2 x i8> %val) {
   call void @llvm.vp.store.v3i64.p0(<3 x i64> %x11, ptr %base, <3 x i1> <i1 1, i1 1, i1 1>, i32 1)
   %x12 = call <4 x i15> @llvm.vp.load.v4i15.p0(ptr %base, <4 x i1> <i1 1, i1 1, i1 1, i1 1>, i32 1)
   call void @llvm.vp.store.v4i15.p0(<4 x i15> %x12, ptr %base, <4 x i1> <i1 1, i1 1, i1 1, i1 1>, i32 1)
-  %x13 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> <i1 1, i1 1>, <2 x i8> <i8 0, i8 0>)
-  call void @llvm.masked.compressstore.v2i8(<2 x i8> %x13, ptr %base, <2 x i1> <i1 1, i1 1>)
+  %x13 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %base, <2 x i1> <i1 1, i1 1>, <2 x i8> <i8 0, i8 0>)
+  call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> %x13, ptr %base, <2 x i1> <i1 1, i1 1>)
   %x14 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.i64(ptr %base, i64 1, <2 x i1> <i1 1, i1 1>, i32 1)
   call void @llvm.experimental.vp.strided.store.v2i8.i64(<2 x i8> %x14, ptr %base, i64 1, <2 x i1> <i1 1, i1 1>, i32 1)
   ret void

diff  --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll
index 12dffb43cba7d..6009ea4e5094a 100644
--- a/llvm/test/Analysis/CostModel/RISCV/gep.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll
@@ -268,7 +268,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43>
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 %3, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42
@@ -280,7 +280,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43>
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> align 1 %9, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42
@@ -297,7 +297,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
   %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef)
 
   %4 = getelementptr i8, ptr %base, i32 42
-  %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+  %x4 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %4, <2 x i1> undef, <2 x i8> undef)
 
   %5 = getelementptr i8, ptr %base, i32 42
   %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
@@ -315,7 +315,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
   call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef)
 
   %10 = getelementptr i8, ptr %base, i32 42
-  call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> undef, ptr %10, <2 x i1> undef)
 
   %11 = getelementptr i8, ptr %base, i32 42
   call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
@@ -338,7 +338,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 %3, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0
@@ -350,7 +350,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> align 1 %9, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0
@@ -367,7 +367,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
   %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef)
 
   %4 = getelementptr i8, ptr %base, i32 0
-  %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+  %x4 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr %4, <2 x i1> undef, <2 x i8> undef)
 
   %5 = getelementptr i8, ptr %base, i32 0
   %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
@@ -385,7 +385,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
   call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef)
 
   %10 = getelementptr i8, ptr %base, i32 0
-  call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> undef, ptr %10, <2 x i1> undef)
 
   %11 = getelementptr i8, ptr %base, i32 0
   call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)

diff  --git a/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll
index 94e2390a12fc1..2d43b48b3b1b3 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll
@@ -4,141 +4,141 @@
 
 define void @expand_load() {
 ; CHECK-LABEL: 'expand_load'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64.p0(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr poison, <8 x i1> poison, <8 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64.p0(ptr poison, <16 x i1> poison, <16 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64.p0(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64.p0(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'expand_load'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64.p0(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr poison, <8 x i1> poison, <8 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64.p0(ptr poison, <16 x i1> poison, <16 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64.p0(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64.p0(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
-  %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
-  %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
-  %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
-  %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align(8) poison, <2 x i1> poison, <2 x i64> poison)
-  %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align(8) poison, <4 x i1> poison, <4 x i64> poison)
-  %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align(8) poison, <8 x i1> poison, <8 x i64> poison)
-  %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align(8) poison, <16 x i1> poison, <16 x i64> poison)
-  %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
-  %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
-  %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
-  %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
-  %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
-  %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
-  %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
-  %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
-  %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align(8) poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
-  %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align(8) poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
-  %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align(8) poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
-  %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align(8) poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
+  %t1 = call <2 x i8> @llvm.masked.expandload.v2i8.p0(ptr poison, <2 x i1> poison, <2 x i8> poison)
+  %t2 = call <4 x i8> @llvm.masked.expandload.v4i8.p0(ptr poison, <4 x i1> poison, <4 x i8> poison)
+  %t3 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr poison, <8 x i1> poison, <8 x i8> poison)
+  %t4 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr poison, <16 x i1> poison, <16 x i8> poison)
+  %t5 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr align(8) poison, <2 x i1> poison, <2 x i64> poison)
+  %t6 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr align(8) poison, <4 x i1> poison, <4 x i64> poison)
+  %t7 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr align(8) poison, <8 x i1> poison, <8 x i64> poison)
+  %t8 = call <16 x i64> @llvm.masked.expandload.v16i64.p0(ptr align(8) poison, <16 x i1> poison, <16 x i64> poison)
+  %t9 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr poison, <2 x i1> poison, <2 x i64> poison)
+  %t10 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr poison, <4 x i1> poison, <4 x i64> poison)
+  %t11 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr poison, <8 x i1> poison, <8 x i64> poison)
+  %t12 = call <16 x i64> @llvm.masked.expandload.v16i64.p0(ptr poison, <16 x i1> poison, <16 x i64> poison)
+  %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8.p0(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+  %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8.p0(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+  %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8.p0(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+  %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8.p0(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+  %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64.p0(ptr align(8) poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+  %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64.p0(ptr align(8) poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+  %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64.p0(ptr align(8) poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+  %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64.p0(ptr align(8) poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
   ret void
 }
 
 define void @compress_store() {
 ; CHECK-LABEL: 'compress_store'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> poison, ptr poison, <2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v4i8.p0(<4 x i8> poison, ptr poison, <4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> poison, ptr poison, <8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> poison, ptr poison, <16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v16i64.p0(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> poison, ptr poison, <2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> poison, ptr poison, <4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> poison, ptr poison, <8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.compressstore.v16i64.p0(<16 x i64> poison, ptr poison, <16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8.p0(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8.p0(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8.p0(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8.p0(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64.p0(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64.p0(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64.p0(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'compress_store'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> poison, ptr poison, <2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i8.p0(<4 x i8> poison, ptr poison, <4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> poison, ptr poison, <8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> poison, ptr poison, <16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64.p0(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> poison, ptr poison, <2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> poison, ptr poison, <4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> poison, ptr poison, <8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64.p0(<16 x i64> poison, ptr poison, <16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8.p0(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8.p0(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8.p0(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8.p0(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64.p0(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64.p0(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64.p0(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
-  call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align(8) poison, <2 x i1> poison)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align(8) poison, <4 x i1> poison)
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align(8) poison, <8 x i1> poison)
-  call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align(8) poison, <16 x i1> poison)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
-  call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
-  call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
-  call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
-  call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
-  call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
-  call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align(8) poison, <vscale x 2 x i1> poison)
-  call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align(8) poison, <vscale x 4 x i1> poison)
-  call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align(8) poison, <vscale x 8 x i1> poison)
-  call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align(8) poison, <vscale x 16 x i1> poison)
+  call void @llvm.masked.compressstore.v2i8.p0(<2 x i8> poison, ptr poison, <2 x i1> poison)
+  call void @llvm.masked.compressstore.v4i8.p0(<4 x i8> poison, ptr poison, <4 x i1> poison)
+  call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> poison, ptr poison, <8 x i1> poison)
+  call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> poison, ptr poison, <16 x i1> poison)
+  call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> poison, ptr align(8) poison, <2 x i1> poison)
+  call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> poison, ptr align(8) poison, <4 x i1> poison)
+  call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> poison, ptr align(8) poison, <8 x i1> poison)
+  call void @llvm.masked.compressstore.v16i64.p0(<16 x i64> poison, ptr align(8) poison, <16 x i1> poison)
+  call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> poison, ptr poison, <2 x i1> poison)
+  call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> poison, ptr poison, <4 x i1> poison)
+  call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> poison, ptr poison, <8 x i1> poison)
+  call void @llvm.masked.compressstore.v16i64.p0(<16 x i64> poison, ptr poison, <16 x i1> poison)
+  call void @llvm.masked.compressstore.nxv2i8.p0(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+  call void @llvm.masked.compressstore.nxv4i8.p0(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+  call void @llvm.masked.compressstore.nxv8i8.p0(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+  call void @llvm.masked.compressstore.nxv16i8.p0(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+  call void @llvm.masked.compressstore.nxv2i64.p0(<vscale x 2 x i64> poison, ptr align(8) poison, <vscale x 2 x i1> poison)
+  call void @llvm.masked.compressstore.nxv4i64.p0(<vscale x 4 x i64> poison, ptr align(8) poison, <vscale x 4 x i1> poison)
+  call void @llvm.masked.compressstore.nxv8i64.p0(<vscale x 8 x i64> poison, ptr align(8) poison, <vscale x 8 x i1> poison)
+  call void @llvm.masked.compressstore.nxv16i64.p0(<vscale x 16 x i64> poison, ptr align(8) poison, <vscale x 16 x i1> poison)
   ret void
 }
 

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index 58345ed8bb358..f835a8aedbbcc 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:125 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:63 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:69 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:35 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:141 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:71 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:36 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:258 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:129 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:65 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:576 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:288 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:144 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:72 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:125 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:63 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:69 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:35 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:141 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:71 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:36 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:258 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:129 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:65 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:576 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:288 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:144 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:72 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:53 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:27 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:109 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:55 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:28 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:452 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:53 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:27 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:109 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:55 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:28 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:452 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:236 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:470 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:236 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:470 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:133 SizeLat:85 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:70 SizeLat:46 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:34 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:137 SizeLat:89 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:68 SizeLat:44 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:271 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:135 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:539 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:269 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:134 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:133 SizeLat:85 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:70 SizeLat:46 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:34 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:137 SizeLat:89 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:68 SizeLat:44 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:271 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:135 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:539 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:269 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:134 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:89 SizeLat:89 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:44 SizeLat:44 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:89 SizeLat:89 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:44 SizeLat:44 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index d19b0b0e1fb1b..ed1b534fac8f8 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:125 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:63 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:69 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:35 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:141 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:71 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:36 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:258 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:129 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:65 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:576 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:288 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:144 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:72 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:125 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:63 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:69 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:35 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:141 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:71 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:36 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:18 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:258 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:129 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:65 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:576 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:288 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:144 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:72 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:53 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:27 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:109 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:55 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:28 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:452 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:53 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:27 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:109 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:55 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:28 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:452 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:226 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:113 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:236 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:470 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:236 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:470 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:57 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:115 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:58 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:29 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:61 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:31 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:119 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:60 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:235 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:118 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:30 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:468 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:234 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:117 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:59 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:133 SizeLat:85 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:70 SizeLat:46 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:34 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:137 SizeLat:89 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:68 SizeLat:44 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:271 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:135 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:539 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:269 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:134 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64.p0(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64.p0(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64.p0(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:133 SizeLat:85 for: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:66 SizeLat:42 for: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32.p0(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:32 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32.p0(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:15 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32.p0(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:70 SizeLat:46 for: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:34 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64.p0(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:6 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64.p0(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:137 SizeLat:89 for: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:68 SizeLat:44 for: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32.p0(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:16 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32.p0(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:271 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16.p0(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:135 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16.p0(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16.p0(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:33 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16.p0(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:539 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8.p0(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:269 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8.p0(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:134 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8.p0(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:67 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8.p0(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:89 SizeLat:89 for: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:44 SizeLat:44 for: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f64.p0(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f64.p0(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f64.p0(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1f64.p0(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:69 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.compressstore.v16f32.p0(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:34 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.compressstore.v8f32.p0(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.compressstore.v4f32.p0(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.compressstore.v2f32.p0(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.compressstore.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.compressstore.v1i64.p0(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:73 CodeSize:89 Lat:89 SizeLat:89 for: call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:36 CodeSize:44 Lat:44 SizeLat:44 for: call void @llvm.masked.compressstore.v8i32.p0(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.compressstore.v2i32.p0(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.compressstore.v32i16.p0(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.compressstore.v16i16.p0(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i16.p0(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.compressstore.v4i16.p0(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.compressstore.v64i8.p0(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.compressstore.v32i8.p0(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.compressstore.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.compressstore.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)

diff  --git a/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll b/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll
index e24db9959425d..decfc9760e0b4 100644
--- a/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll
+++ b/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll
@@ -10,11 +10,11 @@ declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr, <vscale x 2 x i1>,
 ; CHECK: declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr captures(none), <vscale x 2 x i1>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_WILLRETURN_WRITEONLY:#[0-9]+]]
 declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>)
 
-; CHECK: declare <16 x float> @llvm.masked.expandload.v16f32(ptr captures(none), <16 x i1>, <16 x float>) [[NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]]
-declare <16 x float> @llvm.masked.expandload.v16f32 (ptr, <16 x i1>, <16 x float>)
+; CHECK: declare <16 x float> @llvm.masked.expandload.v16f32.p0(ptr captures(none), <16 x i1>, <16 x float>) [[NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]]
+declare <16 x float> @llvm.masked.expandload.v16f32.p0(ptr, <16 x i1>, <16 x float>)
 
-; CHECK: declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr captures(none), <8 x i1>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_WILLRETURN_WRITEONLY:#[0-9]+]]
-declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8  x i1>)
+; CHECK: declare void @llvm.masked.compressstore.v8i32.p0(<8 x i32>, ptr captures(none), <8 x i1>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_WILLRETURN_WRITEONLY:#[0-9]+]]
+declare void @llvm.masked.compressstore.v8i32.p0(<8 x i32>, ptr, <8  x i1>)
 
 ; CHECK: attributes [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_READONLY_WILLRETURN]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
 ; CHECK: attributes [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_WILLRETURN_WRITEONLY]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }

diff  --git a/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll b/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll
index 0c44b300bc00d..e9dd61036e9ce 100644
--- a/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll
+++ b/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll
@@ -52,7 +52,7 @@ define <4 x i32> @expandload(ptr nocapture readonly %a0) !dbg !43 {
 ; CHECK-LABEL: define <4 x i32> @expandload(
 ; CHECK-SAME: ptr readonly captures(none) [[A0:%.*]]) !dbg [[DBG43:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[V0:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32(ptr [[A0]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef), !dbg [[DBG49:![0-9]+]], !tbaa [[INT_TBAA50:![0-9]+]]
+; CHECK-NEXT:    [[V0:%.*]] = call <4 x i32> @llvm.masked.expandload.v4i32.p0(ptr [[A0]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef), !dbg [[DBG49:![0-9]+]], !tbaa [[INT_TBAA50:![0-9]+]]
 ; CHECK-NEXT:    ret <4 x i32> [[V0]], !dbg [[DBG52:![0-9]+]]
 ;
 entry:
@@ -64,7 +64,7 @@ define void @compressstore(<4 x i32> %a0, ptr nocapture %a1) !dbg !53 {
 ; CHECK-LABEL: define void @compressstore(
 ; CHECK-SAME: <4 x i32> [[A0:%.*]], ptr captures(none) [[A1:%.*]]) !dbg [[DBG53:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[A0]], ptr [[A1]], <4 x i1> <i1 false, i1 false, i1 true, i1 true>), !dbg [[DBG59:![0-9]+]], !tbaa [[INT_TBAA50]]
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32.p0(<4 x i32> [[A0]], ptr [[A1]], <4 x i1> <i1 false, i1 false, i1 true, i1 true>), !dbg [[DBG59:![0-9]+]], !tbaa [[INT_TBAA50]]
 ; CHECK-NEXT:    ret void, !dbg [[DBG60:![0-9]+]]
 ;
 entry:

diff  --git a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
index 60a4f2225035f..1ab36233bbb6a 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll
@@ -436,8 +436,8 @@ define void @scalable.scatter.nxv4f32(<vscale x 4 x float> %val, <vscale x 4 x p
   ret void
 }
 
-declare <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr, <vscale x 4 x i1>, <vscale x 4 x float>)
-declare void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float>, ptr, <vscale x 4 x i1>)
+declare <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr, <vscale x 4 x i1>, <vscale x 4 x float>)
+declare void @llvm.masked.compressstore.nxv4f32.p0(<vscale x 4 x float>, ptr, <vscale x 4 x i1>)
 
 define <vscale x 4 x float> @scalable.expandload.nxv4f32(ptr align 4 %p, <vscale x 4 x i1> %mask) sanitize_address {
 ; CHECK-LABEL: @scalable.expandload.nxv4f32(
@@ -466,14 +466,14 @@ define <vscale x 4 x float> @scalable.expandload.nxv4f32(ptr align 4 %p, <vscale
 ; CHECK:       .split.split:
 ; CHECK-NEXT:    br label [[TMP13]]
 ; CHECK:       13:
-; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr [[P]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> undef)
+; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr [[P]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> undef)
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[RES]]
 ;
 ; DISABLED-LABEL: @scalable.expandload.nxv4f32(
-; DISABLED-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> undef)
+; DISABLED-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> undef)
 ; DISABLED-NEXT:    ret <vscale x 4 x float> [[RES]]
 ;
-  %res = tail call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32(ptr %p, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  %res = tail call <vscale x 4 x float> @llvm.masked.expandload.nxv4f32.p0(ptr %p, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
   ret <vscale x 4 x float> %res
 }
 
@@ -504,13 +504,13 @@ define void @scalable.compressstore.nxv4f32(ptr align 4 %p, <vscale x 4 x float>
 ; CHECK:       .split.split:
 ; CHECK-NEXT:    br label [[TMP13]]
 ; CHECK:       13:
-; CHECK-NEXT:    tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> [[ARG:%.*]], ptr [[P]], <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    tail call void @llvm.masked.compressstore.nxv4f32.p0(<vscale x 4 x float> [[ARG:%.*]], ptr [[P]], <vscale x 4 x i1> [[MASK]])
 ; CHECK-NEXT:    ret void
 ;
 ; DISABLED-LABEL: @scalable.compressstore.nxv4f32(
-; DISABLED-NEXT:    tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> [[ARG:%.*]], ptr [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]])
+; DISABLED-NEXT:    tail call void @llvm.masked.compressstore.nxv4f32.p0(<vscale x 4 x float> [[ARG:%.*]], ptr [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]])
 ; DISABLED-NEXT:    ret void
 ;
-  tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> %arg, ptr %p, <vscale x 4 x i1> %mask)
+  tail call void @llvm.masked.compressstore.nxv4f32.p0(<vscale x 4 x float> %arg, ptr %p, <vscale x 4 x i1> %mask)
   ret void
 }

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
index a870fdd945b6f..4711156b34f2e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -16089,7 +16089,7 @@ define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
@@ -16099,7 +16099,7 @@ define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
@@ -16117,14 +16117,14 @@ define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data)  #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 -1)
@@ -16143,7 +16143,7 @@ define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
@@ -16153,7 +16153,7 @@ define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
@@ -16171,14 +16171,14 @@ define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data)  #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 -1)
@@ -16197,7 +16197,7 @@ define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
@@ -16207,7 +16207,7 @@ define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
@@ -16225,14 +16225,14 @@ define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data)  #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v8i64.p0(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 -1)
@@ -16251,7 +16251,7 @@ define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %ma
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
@@ -16261,7 +16261,7 @@ define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %ma
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
@@ -16279,14 +16279,14 @@ define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data)  #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 -1)
@@ -16305,7 +16305,7 @@ define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data,
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
@@ -16315,7 +16315,7 @@ define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data,
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP12]]
 ;
@@ -16334,7 +16334,7 @@ define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask)  #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0
@@ -16344,7 +16344,7 @@ define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask)  #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer)
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP11]]
 ;
@@ -16363,14 +16363,14 @@ define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data)  #0
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP8]]
 ;
@@ -16387,14 +16387,14 @@ define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> %
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> zeroinitializer, <8 x i64> [[TMP2]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP5]], <8 x i1> zeroinitializer, <8 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64.p0(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP8]]
 ;
@@ -16414,7 +16414,7 @@ define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data,
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
@@ -16424,7 +16424,7 @@ define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data,
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP12]]
 ;
@@ -16443,7 +16443,7 @@ define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask)  #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0
@@ -16453,7 +16453,7 @@ define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask)  #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer)
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP11]]
 ;
@@ -16472,14 +16472,14 @@ define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data)  #0
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP8]]
 ;
@@ -16499,7 +16499,7 @@ define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %ma
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
@@ -16509,7 +16509,7 @@ define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %ma
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP12]]
 ;
@@ -16528,7 +16528,7 @@ define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask)  #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0
@@ -16538,7 +16538,7 @@ define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask)  #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP11]]
 ;
@@ -16557,14 +16557,14 @@ define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data)  #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64.p0(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP8]]
 ;
@@ -16584,7 +16584,7 @@ define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
@@ -16594,7 +16594,7 @@ define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       11:
-; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP12]]
 ;
@@ -16613,7 +16613,7 @@ define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask)  #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0
@@ -16623,7 +16623,7 @@ define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask)  #0 {
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       10:
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP11]]
 ;
@@ -16642,14 +16642,14 @@ define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data)  #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
 ; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR9]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP8]]
 ;

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll b/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
index 77b48360a64d8..beebb944e49e8 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
@@ -9,12 +9,12 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
-declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
-declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
-declare void @llvm.masked.scatter.v8i32.v8p0  (<8 x i32>, <8 x ptr>, i32, <8 x i1>)
-declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
-declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, <4 x i1>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, <4 x i1>, <4 x double>)
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, <16 x i1>, <16 x float>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, <8 x i1>)
+declare <16 x float> @llvm.masked.expandload.v16f32.p0(ptr, <16 x i1>, <16 x float>)
+declare void @llvm.masked.compressstore.v16f32.p0(<16 x float>, ptr, <16 x i1>)
 
 define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory {
 ; CHECK-LABEL: @Store(
@@ -81,7 +81,7 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory {
 ; ORIGINS-NEXT:    ret void
 ;
 entry:
-  tail call void @llvm.masked.store.v4i64.p0(<4 x i64> %v, ptr %p, i32 1, <4 x i1> %mask)
+  tail call void @llvm.masked.store.v4i64.p0(<4 x i64> %v, ptr align 1 %p, <4 x i1> %mask)
   ret void
 }
 
@@ -146,7 +146,7 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo
 ; ORIGINS-NEXT:    ret <4 x double> [[X]]
 ;
 entry:
-  %x = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %p, i32 1, <4 x i1> %mask, <4 x double> %v)
+  %x = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 %p, <4 x i1> %mask, <4 x double> %v)
   ret <4 x double> %x
 }
 
@@ -200,7 +200,7 @@ define void @StoreNoSanitize(ptr %p, <4 x i64> %v, <4 x i1> %mask) {
 ; ORIGINS-NEXT:    ret void
 ;
 entry:
-  tail call void @llvm.masked.store.v4i64.p0(<4 x i64> %v, ptr %p, i32 1, <4 x i1> %mask)
+  tail call void @llvm.masked.store.v4i64.p0(<4 x i64> %v, ptr align 1 %p, <4 x i1> %mask)
   ret void
 }
 
@@ -228,7 +228,7 @@ define <4 x double> @LoadNoSanitize(ptr %p, <4 x double> %v, <4 x i1> %mask) {
 ; ORIGINS-NEXT:    ret <4 x double> [[X]]
 ;
 entry:
-  %x = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %p, i32 1, <4 x i1> %mask, <4 x double> %v)
+  %x = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 %p, <4 x i1> %mask, <4 x double> %v)
   ret <4 x double> %x
 }
 
@@ -284,7 +284,7 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas
 ; ORIGINS-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
 ; ORIGINS-NEXT:    ret <16 x float> [[RET]]
 ;
-  %ret = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %passthru)
+  %ret = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> %passthru)
   ret <16 x float> %ret
 }
 
@@ -309,7 +309,7 @@ define <16 x float> @GatherNoSanitize(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x f
 ; ORIGINS-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
 ; ORIGINS-NEXT:    ret <16 x float> [[RET]]
 ;
-  %ret = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %passthru)
+  %ret = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %ptrs, <16 x i1> %mask, <16 x float> %passthru)
   ret <16 x float> %ret
 }
 
@@ -361,7 +361,7 @@ define void @Scatter(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask) sanitize
 ; ORIGINS-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]])
 ; ORIGINS-NEXT:    ret void
 ;
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %value, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %value, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
   ret void
 }
 
@@ -396,7 +396,7 @@ define void @ScatterNoSanitize(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask
 ; ORIGINS-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]])
 ; ORIGINS-NEXT:    ret void
 ;
-  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %value, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %value, <8 x ptr> align 8 %ptrs, <8 x i1> %mask)
   ret void
 }
 
@@ -408,8 +408,8 @@ define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthr
 ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
 ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP4]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[PTR]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP4]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]])
+; CHECK-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[PTR]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[RET]]
 ;
@@ -421,7 +421,7 @@ define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthr
 ; ADDR-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; ADDR-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
 ; ADDR-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-; ADDR-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP6]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP3]])
+; ADDR-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP6]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP3]])
 ; ADDR-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; ADDR-NEXT:    [[TMP7:%.*]] = bitcast <16 x i1> [[TMP2]] to i16
 ; ADDR-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP7]], 0
@@ -431,7 +431,7 @@ define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthr
 ; ADDR-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; ADDR-NEXT:    unreachable
 ; ADDR:       9:
-; ADDR-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[PTR]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]])
+; ADDR-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[PTR]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]])
 ; ADDR-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; ADDR-NEXT:    ret <16 x float> [[RET]]
 ;
@@ -445,37 +445,37 @@ define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthr
 ; ORIGINS-NEXT:    [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416
 ; ORIGINS-NEXT:    [[TMP7:%.*]] = and i64 [[TMP6]], -4
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; ORIGINS-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]])
-; ORIGINS-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[PTR]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]])
+; ORIGINS-NEXT:    [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32.p0(ptr [[TMP5]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]])
+; ORIGINS-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[PTR]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]])
 ; ORIGINS-NEXT:    store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
 ; ORIGINS-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
 ; ORIGINS-NEXT:    ret <16 x float> [[RET]]
 ;
-  %ret = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> %mask, <16 x float> %passthru)
+  %ret = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr %ptr, <16 x i1> %mask, <16 x float> %passthru)
   ret <16 x float> %ret
 }
 
 define <16 x float> @ExpandLoadNoSanitize(ptr %ptr, <16 x i1> %mask, <16 x float> %passthru) {
 ; CHECK-LABEL: @ExpandLoadNoSanitize(
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[PTR:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[PTR:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]])
 ; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[RET]]
 ;
 ; ADDR-LABEL: @ExpandLoadNoSanitize(
 ; ADDR-NEXT:    call void @llvm.donothing()
-; ADDR-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[PTR:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]])
+; ADDR-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[PTR:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]])
 ; ADDR-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; ADDR-NEXT:    ret <16 x float> [[RET]]
 ;
 ; ORIGINS-LABEL: @ExpandLoadNoSanitize(
 ; ORIGINS-NEXT:    call void @llvm.donothing()
-; ORIGINS-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[PTR:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]])
+; ORIGINS-NEXT:    [[RET:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr [[PTR:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]])
 ; ORIGINS-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; ORIGINS-NEXT:    store i32 0, ptr @__msan_retval_origin_tls, align 4
 ; ORIGINS-NEXT:    ret <16 x float> [[RET]]
 ;
-  %ret = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> %mask, <16 x float> %passthru)
+  %ret = call <16 x float> @llvm.masked.expandload.v16f32.p0(ptr %ptr, <16 x i1> %mask, <16 x float> %passthru)
   ret <16 x float> %ret
 }
 
@@ -487,8 +487,8 @@ define void @CompressStore(<16 x float> %value, ptr %ptr, <16 x i1> %mask) sanit
 ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
 ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP1]], ptr [[TMP4]], <16 x i1> [[MASK:%.*]])
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP1]], ptr [[TMP4]], <16 x i1> [[MASK:%.*]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
 ; CHECK-NEXT:    ret void
 ;
 ; ADDR-LABEL: @CompressStore(
@@ -499,7 +499,7 @@ define void @CompressStore(<16 x float> %value, ptr %ptr, <16 x i1> %mask) sanit
 ; ADDR-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; ADDR-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
 ; ADDR-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-; ADDR-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP6]], <16 x i1> [[MASK:%.*]])
+; ADDR-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP3]], ptr [[TMP6]], <16 x i1> [[MASK:%.*]])
 ; ADDR-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; ADDR-NEXT:    [[TMP7:%.*]] = bitcast <16 x i1> [[TMP2]] to i16
 ; ADDR-NEXT:    [[_MSCMP1:%.*]] = icmp ne i16 [[TMP7]], 0
@@ -509,7 +509,7 @@ define void @CompressStore(<16 x float> %value, ptr %ptr, <16 x i1> %mask) sanit
 ; ADDR-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; ADDR-NEXT:    unreachable
 ; ADDR:       9:
-; ADDR-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
+; ADDR-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
 ; ADDR-NEXT:    ret void
 ;
 ; ORIGINS-LABEL: @CompressStore(
@@ -522,11 +522,11 @@ define void @CompressStore(<16 x float> %value, ptr %ptr, <16 x i1> %mask) sanit
 ; ORIGINS-NEXT:    [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416
 ; ORIGINS-NEXT:    [[TMP7:%.*]] = and i64 [[TMP6]], -4
 ; ORIGINS-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP1]], ptr [[TMP5]], <16 x i1> [[MASK:%.*]])
-; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
+; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> [[TMP1]], ptr [[TMP5]], <16 x i1> [[MASK:%.*]])
+; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
 ; ORIGINS-NEXT:    ret void
 ;
-  call void @llvm.masked.compressstore.v16f32(<16 x float> %value, ptr %ptr, <16 x i1> %mask)
+  call void @llvm.masked.compressstore.v16f32.p0(<16 x float> %value, ptr %ptr, <16 x i1> %mask)
   ret void
 }
 
@@ -536,8 +536,8 @@ define void @CompressStoreNoSanitize(<16 x float> %value, ptr %ptr, <16 x i1> %m
 ; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
 ; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> zeroinitializer, ptr [[TMP3]], <16 x i1> [[MASK:%.*]])
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> zeroinitializer, ptr [[TMP3]], <16 x i1> [[MASK:%.*]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
 ; CHECK-NEXT:    ret void
 ;
 ; ADDR-LABEL: @CompressStoreNoSanitize(
@@ -545,8 +545,8 @@ define void @CompressStoreNoSanitize(<16 x float> %value, ptr %ptr, <16 x i1> %m
 ; ADDR-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; ADDR-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
 ; ADDR-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
-; ADDR-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> zeroinitializer, ptr [[TMP3]], <16 x i1> [[MASK:%.*]])
-; ADDR-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
+; ADDR-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> zeroinitializer, ptr [[TMP3]], <16 x i1> [[MASK:%.*]])
+; ADDR-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
 ; ADDR-NEXT:    ret void
 ;
 ; ORIGINS-LABEL: @CompressStoreNoSanitize(
@@ -557,10 +557,10 @@ define void @CompressStoreNoSanitize(<16 x float> %value, ptr %ptr, <16 x i1> %m
 ; ORIGINS-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
 ; ORIGINS-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], -4
 ; ORIGINS-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16i32(<16 x i32> zeroinitializer, ptr [[TMP3]], <16 x i1> [[MASK:%.*]])
-; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16f32(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
+; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16i32.p0(<16 x i32> zeroinitializer, ptr [[TMP3]], <16 x i1> [[MASK:%.*]])
+; ORIGINS-NEXT:    call void @llvm.masked.compressstore.v16f32.p0(<16 x float> [[VALUE:%.*]], ptr [[PTR]], <16 x i1> [[MASK]])
 ; ORIGINS-NEXT:    ret void
 ;
-  call void @llvm.masked.compressstore.v16f32(<16 x float> %value, ptr %ptr, <16 x i1> %mask)
+  call void @llvm.masked.compressstore.v16f32.p0(<16 x float> %value, ptr %ptr, <16 x i1> %mask)
   ret void
 }

diff  --git a/llvm/test/Other/force-opaque-ptrs.ll b/llvm/test/Other/force-opaque-ptrs.ll
index 2850004e04c88..1a7f4a41ee66d 100644
--- a/llvm/test/Other/force-opaque-ptrs.ll
+++ b/llvm/test/Other/force-opaque-ptrs.ll
@@ -74,13 +74,13 @@ define void @remangle_intrinsic() {
 ; CHECK-NEXT:    [[A:%.*]] = alloca ptr, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.stacksave.p0()
 ; CHECK-NEXT:    call void @llvm.stackprotector(ptr null, ptr [[A]])
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.masked.expandload.v2i64(ptr null, <2 x i1> zeroinitializer, <2 x i64> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr null, <2 x i1> zeroinitializer, <2 x i64> zeroinitializer)
 ; CHECK-NEXT:    ret void
 ;
   %a = alloca ptr
   call ptr @llvm.stacksave()
   call void @llvm.stackprotector(ptr null, ptr %a)
-  call <2 x i64> @llvm.masked.expandload.v2i64(ptr null, <2 x i1> zeroinitializer, <2 x i64> zeroinitializer)
+  call <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr null, <2 x i1> zeroinitializer, <2 x i64> zeroinitializer)
   ret void
 }
 
@@ -93,4 +93,4 @@ define ptr @constexpr_gep() {
 
 declare ptr @llvm.stacksave()
 declare void @llvm.stackprotector(ptr, ptr)
-declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64.p0(ptr, <2 x i1>, <2 x i64>)

diff  --git a/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll b/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
index b458b23832c1e..493435ce11514 100644
--- a/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
+++ b/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll
@@ -21,8 +21,8 @@ define void @combine_masked_load_store_from_constant_array(ptr %ptr) {
 
 define void @combine_masked_expandload_compressstore_from_constant_array(ptr %ptr) {
 ; CHECK-LABEL: @combine_masked_expandload_compressstore_from_constant_array(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <10 x i64> @llvm.masked.expandload.v10i64(ptr nonnull @contant_int_array, <10 x i1> splat (i1 true), <10 x i64> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v10i64(<10 x i64> [[TMP1]], ptr [[PTR:%.*]], <10 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = call <10 x i64> @llvm.masked.expandload.v10i64.p0(ptr nonnull @contant_int_array, <10 x i1> splat (i1 true), <10 x i64> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v10i64.p0(<10 x i64> [[TMP1]], ptr [[PTR:%.*]], <10 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void
 ;
   %1 = alloca [10 x i64]
@@ -36,5 +36,5 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
 declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
 declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32, i32)
-declare <10 x i64> @llvm.masked.expandload.v10i64(ptr, <10 x i1>,  <10 x i64>)
+declare <10 x i64> @llvm.masked.expandload.v10i64.p0(ptr, <10 x i1>,  <10 x i64>)
 declare void @llvm.masked.compressstore.nxv10i64.p0(<10 x i64>, ptr, <10 x i1>)

diff  --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-compressstore.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-compressstore.ll
new file mode 100644
index 0000000000000..aedd0c2c012a0
--- /dev/null
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-compressstore.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=spirv64 | FileCheck %s
+
+; TODO: The pass currently optimizes SPIR-V as if it were a CPU target rather than a GPU, which may not be optimal.
+
+define void @scalarize_v2i64(ptr addrspace(4) %p, <2 x i1> %mask, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP3]], ptr addrspace(4) [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi ptr addrspace(4) [ [[TMP4]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i2 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP7]], ptr addrspace(4) [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p4(<2 x i64> %data, ptr addrspace(4) %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @scalarize_v2i64_ones_mask(ptr addrspace(4) %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P:%.*]], i32 0
+; CHECK-NEXT:    store i64 [[ELT0]], ptr addrspace(4) [[TMP1]], align 1
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P]], i32 1
+; CHECK-NEXT:    store i64 [[ELT1]], ptr addrspace(4) [[TMP2]], align 1
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p4(<2 x i64> %data, ptr addrspace(4) %p, <2 x i1> <i1 true, i1 true>)
+  ret void
+}
+
+define void @scalarize_v2i64_zero_mask(ptr addrspace(4) %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p4(<2 x i64> %data, ptr addrspace(4) %p, <2 x i1> <i1 false, i1 false>)
+  ret void
+}
+
+define void @scalarize_v2i64_const_mask(ptr addrspace(4) %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P:%.*]], i32 0
+; CHECK-NEXT:    store i64 [[ELT1]], ptr addrspace(4) [[TMP1]], align 1
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p4(<2 x i64> %data, ptr addrspace(4) %p, <2 x i1> <i1 false, i1 true>)
+  ret void
+}
+
+declare void @llvm.masked.compressstore.v2i64.p4(<2 x i64>, ptr addrspace(4), <2 x i1>)

diff  --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-expandload.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-expandload.ll
new file mode 100644
index 0000000000000..eef5c8673686e
--- /dev/null
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/SPIRV/expand-masked-expandload.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=spirv64 | FileCheck %s
+
+; TODO: The pass currently optimizes SPIR-V as if it were a CPU target rather than a GPU, which may not be optimal.
+
+define <2 x i64> @scalarize_v2i64(ptr addrspace(4) %p, <2 x i1> %mask, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT:    [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
+; CHECK-NEXT:    [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr addrspace(4) [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP4]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi ptr addrspace(4) [ [[TMP5]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr addrspace(4) [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP8]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p4(ptr addrspace(4) %p, <2 x i1> %mask, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_ones_mask(ptr addrspace(4) %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P:%.*]], i32 0
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 1
+; CHECK-NEXT:    [[RES0:%.*]] = insertelement <2 x i64> poison, i64 [[LOAD0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P]], i32 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr addrspace(4) [[TMP2]], align 1
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x i64> [[RES0]], i64 [[LOAD1]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[RES1]], <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p4(ptr addrspace(4) %p, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_zero_mask(ptr addrspace(4) %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> poison, <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p4(ptr addrspace(4) %p, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_const_mask(ptr addrspace(4) %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[P:%.*]], i32 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 1
+; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x i64> poison, i64 [[LOAD1]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[RES1]], <2 x i64> [[PASSTHRU:%.*]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p4(ptr addrspace(4) %p, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+declare <2 x i64> @llvm.masked.expandload.v2i64.p4(ptr addrspace(4),  <2 x i1>, <2 x i64>)

diff  --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index 5acfec359e074..92f0193cdadad 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -1195,7 +1195,7 @@ def LLVM_masked_scatter : LLVM_ZeroResultIntrOp<"masked.scatter"> {
 
 /// Create a call to Masked Expand Load intrinsic.
 def LLVM_masked_expandload
-  : LLVM_OneResultIntrOp<"masked.expandload", [0], [],
+  : LLVM_OneResultIntrOp<"masked.expandload", [0], [0],
   /*traits=*/[], /*requiresFastMath=*/0, /*requiresArgAndResultAttrs=*/1,
   /*immArgPositions=*/[], /*immArgAttrNames=*/[]> {
   dag args = (ins LLVM_AnyPointer:$ptr,
@@ -1211,7 +1211,7 @@ def LLVM_masked_expandload
 
 /// Create a call to Masked Compress Store intrinsic.
 def LLVM_masked_compressstore
-    : LLVM_ZeroResultIntrOp<"masked.compressstore", [0],
+    : LLVM_ZeroResultIntrOp<"masked.compressstore", [0, 1],
   /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0,
   /*requiresArgAndResultAttrs=*/1, /*requiresOpBundles=*/0,
   /*immArgPositions=*/[], /*immArgAttrNames=*/[]> {

diff  --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index 5865e046aa5ac..293a56a82b23c 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -585,10 +585,10 @@ llvm.func @masked_gather_scatter_intrinsics(%M: vector<7 x !llvm.ptr>, %mask: ve
 
 // CHECK-LABEL: @masked_expand_compress_intrinsics
 llvm.func @masked_expand_compress_intrinsics(%ptr: !llvm.ptr, %mask: vector<7xi1>, %passthru: vector<7xf32>) {
-  // CHECK: call <7 x float> @llvm.masked.expandload.v7f32(ptr %{{.*}}, <7 x i1> %{{.*}}, <7 x float> %{{.*}})
+  // CHECK: call <7 x float> @llvm.masked.expandload.v7f32.p0(ptr %{{.*}}, <7 x i1> %{{.*}}, <7 x float> %{{.*}})
   %0 = "llvm.intr.masked.expandload"(%ptr, %mask, %passthru)
     : (!llvm.ptr, vector<7xi1>, vector<7xf32>) -> (vector<7xf32>)
-  // CHECK: call void @llvm.masked.compressstore.v7f32(<7 x float> %{{.*}}, ptr %{{.*}}, <7 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v7f32.p0(<7 x float> %{{.*}}, ptr %{{.*}}, <7 x i1> %{{.*}})
   "llvm.intr.masked.compressstore"(%0, %ptr, %mask)
     : (vector<7xf32>, !llvm.ptr, vector<7xi1>) -> ()
   llvm.return
@@ -596,10 +596,10 @@ llvm.func @masked_expand_compress_intrinsics(%ptr: !llvm.ptr, %mask: vector<7xi1
 
 // CHECK-LABEL: @masked_expand_compress_intrinsics_with_alignment
 llvm.func @masked_expand_compress_intrinsics_with_alignment(%ptr: !llvm.ptr, %mask: vector<7xi1>, %passthru: vector<7xf32>) {
-  // CHECK: call <7 x float> @llvm.masked.expandload.v7f32(ptr align 8 %{{.*}}, <7 x i1> %{{.*}}, <7 x float> %{{.*}})
+  // CHECK: call <7 x float> @llvm.masked.expandload.v7f32.p0(ptr align 8 %{{.*}}, <7 x i1> %{{.*}}, <7 x float> %{{.*}})
   %0 = "llvm.intr.masked.expandload"(%ptr, %mask, %passthru) {arg_attrs = [{llvm.align = 8 : i32}, {}, {}]}
     : (!llvm.ptr, vector<7xi1>, vector<7xf32>) -> (vector<7xf32>)
-  // CHECK: call void @llvm.masked.compressstore.v7f32(<7 x float> %{{.*}}, ptr align 8 %{{.*}}, <7 x i1> %{{.*}})
+  // CHECK: call void @llvm.masked.compressstore.v7f32.p0(<7 x float> %{{.*}}, ptr align 8 %{{.*}}, <7 x i1> %{{.*}})
   "llvm.intr.masked.compressstore"(%0, %ptr, %mask) {arg_attrs = [{}, {llvm.align = 8 : i32}, {}]}
     : (vector<7xf32>, !llvm.ptr, vector<7xi1>) -> ()
   llvm.return
@@ -1475,8 +1475,8 @@ llvm.func @vector_scmp(%a: vector<4 x i32>, %b: vector<4 x i32>) -> vector<4 x i
 // CHECK-DAG: declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr captures(none), <7 x i1>)
 // CHECK-DAG: declare <7 x float> @llvm.masked.gather.v7f32.v7p0(<7 x ptr>, <7 x i1>, <7 x float>)
 // CHECK-DAG: declare void @llvm.masked.scatter.v7f32.v7p0(<7 x float>, <7 x ptr>, <7 x i1>)
-// CHECK-DAG: declare <7 x float> @llvm.masked.expandload.v7f32(ptr captures(none), <7 x i1>, <7 x float>)
-// CHECK-DAG: declare void @llvm.masked.compressstore.v7f32(<7 x float>, ptr captures(none), <7 x i1>)
+// CHECK-DAG: declare <7 x float> @llvm.masked.expandload.v7f32.p0(ptr captures(none), <7 x i1>, <7 x float>)
+// CHECK-DAG: declare void @llvm.masked.compressstore.v7f32.p0(<7 x float>, ptr captures(none), <7 x i1>)
 // CHECK-DAG: declare void @llvm.var.annotation.p0.p0(ptr, ptr, ptr, i32, ptr)
 // CHECK-DAG: declare ptr @llvm.ptr.annotation.p0.p0(ptr, ptr, ptr, i32, ptr)
 // CHECK-DAG: declare i16 @llvm.annotation.i16.p0(i16, ptr, ptr, i32)