[llvm] [TTI][CostModel] Add cost modeling for expandload and compressstore intrinsics (PR #122882)

Thu Jan 23 04:27:02 PST 2025

https://github.com/skachkov-sc updated https://github.com/llvm/llvm-project/pull/122882

>From d3da1a2b243e8cdf1dc7148a35470b6b7288679d Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Tue, 24 Dec 2024 15:12:25 +0300
Subject: [PATCH 1/3] [TTI][CostModel] Add cost modeling for expandload and
 compressstore intrinsics

---
 .../llvm/Analysis/TargetTransformInfo.h       |  24 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   7 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  23 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   9 +
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  37 ++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   5 +
 llvm/test/Analysis/CostModel/RISCV/gep.ll     |   8 +-
 .../Analysis/CostModel/RISCV/vp-intrinsics.ll | 174 ++++++
 .../X86/masked-intrinsic-codesize.ll          | 576 +++++++++---------
 .../X86/masked-intrinsic-cost-inseltpoison.ll | 576 +++++++++---------
 .../CostModel/X86/masked-intrinsic-cost.ll    | 576 +++++++++---------
 .../CostModel/X86/masked-intrinsic-latency.ll | 576 +++++++++---------
 .../X86/masked-intrinsic-sizelatency.ll       | 576 +++++++++---------
 13 files changed, 1723 insertions(+), 1444 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 752313ab15858c..7254b6d18cdf27 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1492,6 +1492,19 @@ class TargetTransformInfo {
       Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       const Instruction *I = nullptr) const;
 
+  /// \return The cost of Expand Load or Compress Store operation
+  /// \p Opcode - is a type of memory access Load or Store
+  /// \p Src - a vector type of the data to be loaded or stored
+  /// \p VariableMask - true when the memory access is predicated with a mask
+  ///                   that is not a compile-time constant
+  /// \p Alignment - alignment of single element
+  /// \p I - the optional original context instruction, if one exists, e.g. the
+  ///        load/store to transform or the call to the gather/scatter intrinsic
+  InstructionCost getConsecutiveMemoryOpCost(
+      unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+      const Instruction *I = nullptr) const;
+
   /// \return The cost of strided memory operations.
   /// \p Opcode - is a type of memory access Load or Store
   /// \p DataTy - a vector type of the data to be loaded or stored
@@ -2178,6 +2191,10 @@ class TargetTransformInfo::Concept {
                          TTI::TargetCostKind CostKind,
                          const Instruction *I = nullptr) = 0;
   virtual InstructionCost
+  getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+                             Align Alignment, TTI::TargetCostKind CostKind,
+                             const Instruction *I = nullptr) = 0;
+  virtual InstructionCost
   getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
                          bool VariableMask, Align Alignment,
                          TTI::TargetCostKind CostKind,
@@ -2898,6 +2915,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                        Alignment, CostKind, I);
   }
   InstructionCost
+  getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+                             Align Alignment, TTI::TargetCostKind CostKind,
+                             const Instruction *I = nullptr) override {
+    return Impl.getConsecutiveMemoryOpCost(Opcode, DataTy, VariableMask,
+                                           Alignment, CostKind, I);
+  }
+  InstructionCost
   getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
                          bool VariableMask, Align Alignment,
                          TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 9c74b2a0c31df1..38c5a5d66c3d16 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -765,6 +765,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  InstructionCost
+  getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+                             Align Alignment, TTI::TargetCostKind CostKind,
+                             const Instruction *I = nullptr) const {
+    return 1;
+  }
+
   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c9f142d64ae9e4..b6a70dacc11ebb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1469,6 +1469,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        true, CostKind);
   }
 
+  InstructionCost getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy,
+                                             bool VariableMask, Align Alignment,
+                                             TTI::TargetCostKind CostKind,
+                                             const Instruction *I = nullptr) {
+    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
+                                       true, CostKind);
+  }
+
   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
@@ -1777,6 +1785,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                              VarMask, Alignment, CostKind, I);
     }
+    case Intrinsic::masked_compressstore: {
+      const Value *Data = Args[0];
+      const Value *Mask = Args[2];
+      Align Alignment = I->getParamAlign(1).valueOrOne();
+      return thisT()->getConsecutiveMemoryOpCost(
+          Instruction::Store, Data->getType(), !isa<Constant>(Mask), Alignment,
+          CostKind, I);
+    }
+    case Intrinsic::masked_expandload: {
+      const Value *Mask = Args[1];
+      Align Alignment = I->getParamAlign(0).valueOrOne();
+      return thisT()->getConsecutiveMemoryOpCost(Instruction::Load, RetTy,
+                                                 !isa<Constant>(Mask),
+                                                 Alignment, CostKind, I);
+    }
     case Intrinsic::experimental_vp_strided_store: {
       const Value *Data = Args[0];
       const Value *Ptr = Args[1];
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b32dffa9f0fe86..dc298ce469d96e 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1135,6 +1135,15 @@ InstructionCost TargetTransformInfo::getGatherScatterOpCost(
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getConsecutiveMemoryOpCost(
+    unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+    TTI::TargetCostKind CostKind, const Instruction *I) const {
+  InstructionCost Cost = TTIImpl->getConsecutiveMemoryOpCost(
+      Opcode, DataTy, VariableMask, Alignment, CostKind, I);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getStridedMemoryOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index add82dc80c4290..5e1893d632f8d5 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -838,6 +838,43 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
   return NumLoads * MemOpCost;
 }
 
+InstructionCost RISCVTTIImpl::getConsecutiveMemoryOpCost(
+    unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+    TTI::TargetCostKind CostKind, const Instruction *I) {
+  bool IsLegal = (Opcode == Instruction::Store &&
+                  isLegalMaskedCompressStore(DataTy, Alignment)) ||
+                 (Opcode == Instruction::Load &&
+                  isLegalMaskedExpandLoad(DataTy, Alignment));
+  if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
+    return BaseT::getConsecutiveMemoryOpCost(Opcode, DataTy, VariableMask,
+                                             Alignment, CostKind, I);
+  // Example compressstore sequence:
+  // vsetivli        zero, 8, e32, m2, ta, ma (ignored)
+  // vcompress.vm    v10, v8, v0
+  // vcpop.m a1, v0
+  // vsetvli zero, a1, e32, m2, ta, ma
+  // vse32.v v10, (a0)
+  // Example expandload sequence:
+  // vsetivli        zero, 8, e8, mf2, ta, ma (ignored)
+  // vcpop.m a1, v0
+  // vsetvli zero, a1, e32, m2, ta, ma
+  // vle32.v v10, (a0)
+  // vsetivli        zero, 8, e32, m2, ta, ma
+  // viota.m v12, v0
+  // vrgather.vv     v8, v10, v12, v0.t
+  auto MemOpCost = getMemoryOpCost(Opcode, DataTy, Alignment, 0, CostKind);
+  auto LT = getTypeLegalizationCost(DataTy);
+  SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
+  if (VariableMask)
+    Opcodes.push_back(RISCV::VCPOP_M);
+  if (Opcode == Instruction::Store)
+    Opcodes.append({RISCV::VCOMPRESS_VM});
+  else
+    Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
+  return MemOpCost +
+         LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+}
+
 InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 9b364391f0fa47..4d110521424e4d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -166,6 +166,11 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I);
 
+  InstructionCost getConsecutiveMemoryOpCost(unsigned Opcode, Type *Src,
+                                             bool VariableMask, Align Alignment,
+                                             TTI::TargetCostKind CostKind,
+                                             const Instruction *I = nullptr);
+
   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
diff --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll
index 44c0b5bb71070e..99c74c4635fc70 100644
--- a/llvm/test/Analysis/CostModel/RISCV/gep.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll
@@ -268,7 +268,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43>
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42
@@ -280,7 +280,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43>
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42
@@ -338,7 +338,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0
@@ -350,7 +350,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+; RVI-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
 ; RVI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 5126a6a0a3cbcd..b18d6a067fca0f 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1453,6 +1453,146 @@ define void @store() {
   ret void
 }
 
+define void @expand_load() {
+; CHECK-LABEL: 'expand_load'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr undef, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr undef, <4 x i1> undef, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 undef, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 undef, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 undef, <8 x i1> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 undef, <16 x i1> undef, <16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr undef, <16 x i1> undef, <16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'expand_load'
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr undef, <2 x i1> undef, <2 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr undef, <4 x i1> undef, <4 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 undef, <2 x i1> undef, <2 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 undef, <4 x i1> undef, <4 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 undef, <8 x i1> undef, <8 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 undef, <16 x i1> undef, <16 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr undef, <16 x i1> undef, <16 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr undef, <2 x i1> undef, <2 x i8> undef)
+  %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr undef, <4 x i1> undef, <4 x i8> undef)
+  %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+  %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+  %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align(8) undef, <2 x i1> undef, <2 x i64> undef)
+  %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align(8) undef, <4 x i1> undef, <4 x i64> undef)
+  %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align(8) undef, <8 x i1> undef, <8 x i64> undef)
+  %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align(8) undef, <16 x i1> undef, <16 x i64> undef)
+  %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+  %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+  %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+  %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr undef, <16 x i1> undef, <16 x i64> undef)
+  %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
+  %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
+  %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
+  %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
+  %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align(8) undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
+  %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align(8) undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
+  %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align(8) undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
+  %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align(8) undef, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+  ret void
+}
+
+define void @compress_store() {
+; CHECK-LABEL: 'compress_store'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr align 8 undef, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr align 8 undef, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr align 8 undef, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr align 8 undef, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> undef, ptr align 8 undef, <vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> undef, ptr align 8 undef, <vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> undef, ptr align 8 undef, <vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> undef, ptr align 8 undef, <vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'compress_store'
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr align 8 undef, <2 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr align 8 undef, <4 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr align 8 undef, <8 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr align 8 undef, <16 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> undef, ptr align 8 undef, <vscale x 2 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> undef, ptr align 8 undef, <vscale x 4 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> undef, ptr align 8 undef, <vscale x 8 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> undef, ptr align 8 undef, <vscale x 16 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr align(8) undef, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr align(8) undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr align(8) undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr align(8) undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef)
+  call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef)
+  call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef)
+  call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef)
+  call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> undef, ptr align(8) undef, <vscale x 2 x i1> undef)
+  call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> undef, ptr align(8) undef, <vscale x 4 x i1> undef)
+  call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> undef, ptr align(8) undef, <vscale x 8 x i1> undef)
+  call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> undef, ptr align(8) undef, <vscale x 16 x i1> undef)
+  ret void
+}
+
 define void @strided_load() {
 ; CHECK-LABEL: 'strided_load'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
@@ -2436,6 +2576,40 @@ declare void @llvm.vp.store.nxv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>,
 declare void @llvm.vp.store.nxv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
 declare void @llvm.vp.store.nxv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
 
+declare <2 x i8> @llvm.masked.expandload.v2i8(ptr, <2 x i1>, <2 x i8>)
+declare <4 x i8> @llvm.masked.expandload.v4i8(ptr, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <16 x i64> @llvm.masked.expandload.v16i64(ptr, <16 x i1>, <16 x i64>)
+declare <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr, <vscale x 8 x i1>, <vscale x 8 x i8>)
+declare <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr, <vscale x 4 x i1>, <vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr, <vscale x 8 x i1>, <vscale x 8 x i64>)
+declare <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr, <vscale x 16 x i1>, <vscale x 16 x i64>)
+
+declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i64(<16 x i64>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>)
+declare void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>)
+declare void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>)
+declare void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>)
+declare void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>)
+declare void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>)
+declare void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>)
+declare void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>)
+
 declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.i64(ptr, i64, <2 x i1>, i32)
 declare <4 x i8> @llvm.experimental.vp.strided.load.v4i8.i64(ptr, i64, <4 x i1>, i32)
 declare <8 x i8> @llvm.experimental.vp.strided.load.v8i8.i64(ptr, i64, <8 x i1>, i32)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
index 47472d57228585..d49528d558ec88 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index e18a4d0467caf8..41f0287079b950 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 4ca1f7f419e6a5..6f90816fcdc8e9 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
index 4fb2f1a50a6ed2..94b9b56c10cc76 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
index 6be81602ab15bd..bea77c82b9f90d 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
 
 define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
   %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
 
 define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
 ; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
 ; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 89 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;
   call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)

>From adecf3a78fbe89391754de0603cafb33d08154a4 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Thu, 23 Jan 2025 13:40:50 +0300
Subject: [PATCH 2/3] Rename getConsecutiveMemoryOpCost to
 getExpandCompressMemoryOpCost

---
 .../llvm/Analysis/TargetTransformInfo.h       | 20 +++++++++----------
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  7 +++----
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 15 +++++++-------
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 ++--
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  6 +++---
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  9 +++++----
 6 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7254b6d18cdf27..27d22ed510f46a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1500,7 +1500,7 @@ class TargetTransformInfo {
   /// \p Alignment - alignment of single element
   /// \p I - the optional original context instruction, if one exists, e.g. the
   ///        load/store to transform or the call to the gather/scatter intrinsic
-  InstructionCost getConsecutiveMemoryOpCost(
+  InstructionCost getExpandCompressMemoryOpCost(
       unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       const Instruction *I = nullptr) const;
@@ -2190,10 +2190,9 @@ class TargetTransformInfo::Concept {
                          bool VariableMask, Align Alignment,
                          TTI::TargetCostKind CostKind,
                          const Instruction *I = nullptr) = 0;
-  virtual InstructionCost
-  getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask,
-                             Align Alignment, TTI::TargetCostKind CostKind,
-                             const Instruction *I = nullptr) = 0;
+  virtual InstructionCost getExpandCompressMemoryOpCost(
+      unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+      TTI::TargetCostKind CostKind, const Instruction *I = nullptr) = 0;
   virtual InstructionCost
   getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
                          bool VariableMask, Align Alignment,
@@ -2914,12 +2913,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                        Alignment, CostKind, I);
   }
-  InstructionCost
-  getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask,
-                             Align Alignment, TTI::TargetCostKind CostKind,
-                             const Instruction *I = nullptr) override {
-    return Impl.getConsecutiveMemoryOpCost(Opcode, DataTy, VariableMask,
-                                           Alignment, CostKind, I);
+  InstructionCost getExpandCompressMemoryOpCost(
+      unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+      TTI::TargetCostKind CostKind, const Instruction *I = nullptr) override {
+    return Impl.getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
+                                              Alignment, CostKind, I);
   }
   InstructionCost
   getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38c5a5d66c3d16..c605c6e1396c3f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -765,10 +765,9 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
-  InstructionCost
-  getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask,
-                             Align Alignment, TTI::TargetCostKind CostKind,
-                             const Instruction *I = nullptr) const {
+  InstructionCost getExpandCompressMemoryOpCost(
+      unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+      TTI::TargetCostKind CostKind, const Instruction *I = nullptr) const {
     return 1;
   }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index b6a70dacc11ebb..1f88ce82f1f8f1 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1469,10 +1469,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        true, CostKind);
   }
 
-  InstructionCost getConsecutiveMemoryOpCost(unsigned Opcode, Type *DataTy,
-                                             bool VariableMask, Align Alignment,
-                                             TTI::TargetCostKind CostKind,
-                                             const Instruction *I = nullptr) {
+  InstructionCost getExpandCompressMemoryOpCost(
+      unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
+      TTI::TargetCostKind CostKind, const Instruction *I = nullptr) {
     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
                                        true, CostKind);
   }
@@ -1789,16 +1788,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       const Value *Data = Args[0];
       const Value *Mask = Args[2];
       Align Alignment = I->getParamAlign(1).valueOrOne();
-      return thisT()->getConsecutiveMemoryOpCost(
+      return thisT()->getExpandCompressMemoryOpCost(
           Instruction::Store, Data->getType(), !isa<Constant>(Mask), Alignment,
           CostKind, I);
     }
     case Intrinsic::masked_expandload: {
       const Value *Mask = Args[1];
       Align Alignment = I->getParamAlign(0).valueOrOne();
-      return thisT()->getConsecutiveMemoryOpCost(Instruction::Load, RetTy,
-                                                 !isa<Constant>(Mask),
-                                                 Alignment, CostKind, I);
+      return thisT()->getExpandCompressMemoryOpCost(Instruction::Load, RetTy,
+                                                    !isa<Constant>(Mask),
+                                                    Alignment, CostKind, I);
     }
     case Intrinsic::experimental_vp_strided_store: {
       const Value *Data = Args[0];
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index dc298ce469d96e..74bf312a1f4e1d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1135,10 +1135,10 @@ InstructionCost TargetTransformInfo::getGatherScatterOpCost(
   return Cost;
 }
 
-InstructionCost TargetTransformInfo::getConsecutiveMemoryOpCost(
+InstructionCost TargetTransformInfo::getExpandCompressMemoryOpCost(
     unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
     TTI::TargetCostKind CostKind, const Instruction *I) const {
-  InstructionCost Cost = TTIImpl->getConsecutiveMemoryOpCost(
+  InstructionCost Cost = TTIImpl->getExpandCompressMemoryOpCost(
       Opcode, DataTy, VariableMask, Alignment, CostKind, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5e1893d632f8d5..326edc1955e435 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -838,7 +838,7 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
   return NumLoads * MemOpCost;
 }
 
-InstructionCost RISCVTTIImpl::getConsecutiveMemoryOpCost(
+InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost(
     unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
     TTI::TargetCostKind CostKind, const Instruction *I) {
   bool IsLegal = (Opcode == Instruction::Store &&
@@ -846,8 +846,8 @@ InstructionCost RISCVTTIImpl::getConsecutiveMemoryOpCost(
                  (Opcode == Instruction::Load &&
                   isLegalMaskedExpandLoad(DataTy, Alignment));
   if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
-    return BaseT::getConsecutiveMemoryOpCost(Opcode, DataTy, VariableMask,
-                                             Alignment, CostKind, I);
+    return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
+                                                Alignment, CostKind, I);
   // Example compressstore sequence:
   // vsetivli        zero, 8, e32, m2, ta, ma (ignored)
   // vcompress.vm    v10, v8, v0
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 4d110521424e4d..e91c0a0b3aee11 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -166,10 +166,11 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I);
 
-  InstructionCost getConsecutiveMemoryOpCost(unsigned Opcode, Type *Src,
-                                             bool VariableMask, Align Alignment,
-                                             TTI::TargetCostKind CostKind,
-                                             const Instruction *I = nullptr);
+  InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src,
+                                                bool VariableMask,
+                                                Align Alignment,
+                                                TTI::TargetCostKind CostKind,
+                                                const Instruction *I = nullptr);
 
   InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,

>From 29349d74ce738d44de74f893229afdb0064ace8e Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Thu, 23 Jan 2025 15:06:41 +0300
Subject: [PATCH 3/3] Use poison instead of undef

---
 .../Analysis/CostModel/RISCV/vp-intrinsics.ll | 240 +++++++++---------
 1 file changed, 120 insertions(+), 120 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index b18d6a067fca0f..eb3237893e2e75 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1455,141 +1455,141 @@ define void @store() {
 
 define void @expand_load() {
 ; CHECK-LABEL: 'expand_load'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr undef, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr undef, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 undef, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 undef, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 undef, <8 x i1> undef, <8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 undef, <16 x i1> undef, <16 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr undef, <16 x i1> undef, <16 x i64> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'expand_load'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr undef, <2 x i1> undef, <2 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr undef, <4 x i1> undef, <4 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 undef, <2 x i1> undef, <2 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 undef, <4 x i1> undef, <4 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 undef, <8 x i1> undef, <8 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 undef, <16 x i1> undef, <16 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr undef, <16 x i1> undef, <16 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 141 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr undef, <2 x i1> undef, <2 x i8> undef)
-  %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr undef, <4 x i1> undef, <4 x i8> undef)
-  %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
-  %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
-  %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align(8) undef, <2 x i1> undef, <2 x i64> undef)
-  %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align(8) undef, <4 x i1> undef, <4 x i64> undef)
-  %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align(8) undef, <8 x i1> undef, <8 x i64> undef)
-  %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align(8) undef, <16 x i1> undef, <16 x i64> undef)
-  %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
-  %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
-  %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
-  %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr undef, <16 x i1> undef, <16 x i64> undef)
-  %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
-  %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
-  %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
-  %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
-  %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align(8) undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
-  %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align(8) undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
-  %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align(8) undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef)
-  %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align(8) undef, <vscale x 16 x i1> undef, <vscale x 16 x i64> undef)
+  %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
+  %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
+  %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
+  %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
+  %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align(8) poison, <2 x i1> poison, <2 x i64> poison)
+  %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align(8) poison, <4 x i1> poison, <4 x i64> poison)
+  %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align(8) poison, <8 x i1> poison, <8 x i64> poison)
+  %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align(8) poison, <16 x i1> poison, <16 x i64> poison)
+  %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
+  %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
+  %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
+  %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
+  %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+  %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+  %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+  %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+  %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align(8) poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+  %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align(8) poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+  %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align(8) poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+  %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align(8) poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
   ret void
 }
 
 define void @compress_store() {
 ; CHECK-LABEL: 'compress_store'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr align 8 undef, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr align 8 undef, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr align 8 undef, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr align 8 undef, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> undef, ptr align 8 undef, <vscale x 2 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> undef, ptr align 8 undef, <vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> undef, ptr align 8 undef, <vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> undef, ptr align 8 undef, <vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'compress_store'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr align 8 undef, <2 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr align 8 undef, <4 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr align 8 undef, <8 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr align 8 undef, <16 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> undef, ptr align 8 undef, <vscale x 2 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> undef, ptr align 8 undef, <vscale x 4 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> undef, ptr align 8 undef, <vscale x 8 x i1> undef)
-; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> undef, ptr align 8 undef, <vscale x 16 x i1> undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
+; TYPEBASED-NEXT:  Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr align(8) undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr align(8) undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr align(8) undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr align(8) undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
-  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
-  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
-  call void @llvm.masked.compressstore.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef)
-  call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> undef, ptr undef, <vscale x 2 x i1> undef)
-  call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> undef, ptr undef, <vscale x 4 x i1> undef)
-  call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> undef, ptr undef, <vscale x 8 x i1> undef)
-  call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> undef, ptr undef, <vscale x 16 x i1> undef)
-  call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> undef, ptr align(8) undef, <vscale x 2 x i1> undef)
-  call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> undef, ptr align(8) undef, <vscale x 4 x i1> undef)
-  call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> undef, ptr align(8) undef, <vscale x 8 x i1> undef)
-  call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> undef, ptr align(8) undef, <vscale x 16 x i1> undef)
+  call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
+  call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align(8) poison, <2 x i1> poison)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align(8) poison, <4 x i1> poison)
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align(8) poison, <8 x i1> poison)
+  call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align(8) poison, <16 x i1> poison)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
+  call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
+  call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+  call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+  call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+  call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+  call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align(8) poison, <vscale x 2 x i1> poison)
+  call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align(8) poison, <vscale x 4 x i1> poison)
+  call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align(8) poison, <vscale x 8 x i1> poison)
+  call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align(8) poison, <vscale x 16 x i1> poison)
   ret void
 }