[llvm] r258675 - AVX1 : Enable vector masked_load/store to AVX1.

Igor Breger via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 25 02:17:11 PST 2016


Author: ibreger
Date: Mon Jan 25 04:17:11 2016
New Revision: 258675

URL: http://llvm.org/viewvc/llvm-project?rev=258675&view=rev
Log:
AVX1 : Enable vector masked_load/store to AVX1.
Use the AVX1 FP instructions (vmaskmovps/pd) in place of the AVX2 integer instructions (vpmaskmovd/q).

Differential Revision: http://reviews.llvm.org/D16528
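
For context, a minimal LLVM IR sketch of the kind of masked load this enables on an
AVX1-only target (illustrative only, not part of the patch or its tests; the function
and value names are made up):

define <8 x float> @load_v8f32_masked(<8 x float>* %addr, <8 x i32> %trigger) {
  ; Compare against zero to build the mask, as the tests below do.
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  ; With this patch an AVX1 target selects vmaskmovps here instead of
  ; scalarizing the load into per-element branches.
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> undef)
  ret <8 x float> %res
}
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)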

Modified:
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/test/CodeGen/X86/masked_memop.ll
    llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=258675&r1=258674&r2=258675&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Jan 25 04:17:11 2016
@@ -8703,116 +8703,47 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskm
                                 int_x86_avx2_maskstore_q,
                                 int_x86_avx2_maskstore_q_256>, VEX_W;
 
-def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
-         (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
-         (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)),
-         (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)),
-         (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
-         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
-                             (bc_v8f32 (v8i32 immAllZerosV)))),
-         (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
-         (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr),
-                       VR256:$mask)>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
-         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
-         (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
-         (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
-                       VR256:$mask)>;
-
-def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
-         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask),
-                             (bc_v4f32 (v4i32 immAllZerosV)))),
-         (VMASKMOVPSrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))),
-         (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr),
-                       VR128:$mask)>;
-
-def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)),
-         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))),
-         (VPMASKMOVDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))),
-         (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr),
-                       VR128:$mask)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
-         (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
-         (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
-
-def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
-         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
-                             (v4f64 immAllZerosV))),
-         (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
-         (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr),
-                       VR256:$mask)>;
-
-def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
-         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
-                             (bc_v4i64 (v8i32 immAllZerosV)))),
-         (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
-
-def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
-         (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
-                       VR256:$mask)>;
-
-def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)),
-         (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)),
-         (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>;
-
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
-         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
-                             (v2f64 immAllZerosV))),
-         (VMASKMOVPDrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))),
-         (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr),
-                       VR128:$mask)>;
-
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)),
-         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask),
-                             (bc_v2i64 (v4i32 immAllZerosV)))),
-         (VPMASKMOVQrm VR128:$mask, addr:$ptr)>;
-
-def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))),
-         (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr),
-                       VR128:$mask)>;
-
+multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
+                          ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+    // masked store
+    def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
+             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
+    // masked load
+    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
+             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+                              (VT (bitconvert (ZeroVT immAllZerosV))))),
+             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+             (!cast<Instruction>(BlendStr#"rr")
+                 RC:$src0,
+                 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+                 RC:$mask)>;
+}
+let Predicates = [HasAVX] in {
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
+}
+let Predicates = [HasAVX1Only] in {
+  // zero vector created as v8f32 (based on X86TargetLowering::LowerBUILD_VECTOR)
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8f32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8f32>;
+  // AVX1 has no integer (i32/i64) masked load/store; use the ps/pd versions instead
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8f32>;
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+let Predicates = [HasAVX2] in {
+  // zero vector created as v8i32 (based on X86TargetLowering::LowerBUILD_VECTOR)
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+
+  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
 //
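
The HasAVX1Only block above is the key part of this patch: AVX1 has no vpmaskmovd/q,
so integer vectors reuse the FP vmaskmovps/pd encodings. A hedged IR sketch
(illustrative names, not taken from the tests) of an integer masked store that these
patterns now cover:

define void @store_v8i32_masked(<8 x i32> %val, <8 x i32>* %addr, <8 x i32> %trigger) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  ; On an AVX1-only target this selects vmaskmovps; with AVX2 the
  ; HasAVX2 patterns pick vpmaskmovd instead.
  call void @llvm.masked.store.v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)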

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=258675&r1=258674&r2=258675&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Mon Jan 25 04:17:11 2016
@@ -1438,7 +1438,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type
   int DataWidth = isa<PointerType>(ScalarTy) ?
     DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
 
-  return (DataWidth >= 32 && ST->hasAVX2());
+  return (DataWidth >= 32 && ST->hasAVX());
 }
 
 bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
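
A brief note on what the widened predicate means for the loop vectorizer: element
types of at least 32 bits now report masked loads/stores as legal on plain AVX, so IR
like the sketch below (illustrative, not copied from the modified tests) may be formed
when vectorizing a conditional loop on an AVX1 target; sub-32-bit elements still
return false and keep being if-converted or scalarized.

define <4 x double> @vectorizer_candidate(<4 x double>* %p, <4 x i1> %m) {
  ; double is 64 bits wide, so isLegalMaskedLoad now returns true with AVX1
  ; and the backend lowers this call to vmaskmovpd.
  %v = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %p, i32 8, <4 x i1> %m, <4 x double> undef)
  ret <4 x double> %v
}
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)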

Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=258675&r1=258674&r2=258675&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Mon Jan 25 04:17:11 2016
@@ -1,20 +1,29 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX2
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512
-; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=SKX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX  --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
 
-; FIXME: AVX1 supports vmaskmovp[s/d], so its codegen should be identical to AVX2 for FP cases.
-; For integer cases, AVX1 could use the FP instructions in place of vpmaskmov?
-
-; To test for the case where masked load/store is not legal, we should add a run with a target 
+; To test for the case where masked load/store is not legal, we should add a run with a target
 ; that does not have AVX, but that case should probably be a separate test file using less tests
-; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov. 
+; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
 
 define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
-; Bypassing exact checking here because it's over 300 lines.
 ; AVX1-LABEL: test1:
-; AVX1-NOT:   maskmov
-
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
 ; AVX2-LABEL: test1:
 ; AVX2:       ## BB#0:
 ; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
@@ -30,22 +39,26 @@ define <16 x i32> @test1(<16 x i32> %tri
 ; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
-;
-; SKX-LABEL: test1:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
-; SKX-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; SKX-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
-; Bypassing exact checking here because it's over 300 lines.
 ; AVX1-LABEL: test2:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test2:
 ; AVX2:       ## BB#0:
@@ -62,22 +75,27 @@ define <16 x i32> @test2(<16 x i32> %tri
 ; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
-;
-; SKX-LABEL: test2:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
-; SKX-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; SKX-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
   ret <16 x i32> %res
 }
 
 define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
-; Bypassing exact checking here because it's over 300 lines.
 ; AVX1-LABEL: test3:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX1-NEXT:    vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test3:
 ; AVX2:       ## BB#0:
@@ -95,22 +113,28 @@ define void @test3(<16 x i32> %trigger,
 ; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
 ; AVX512-NEXT:    retq
-;
-; SKX-LABEL: test3:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
-; SKX-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
-; SKX-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
   ret void
 }
 
 define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
-; Bypassing exact checking here because it's over 300 lines.
 ; AVX1-LABEL: test4:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm4
+; AVX1-NEXT:    vblendvps %ymm0, %ymm4, %ymm2, %ymm0
+; AVX1-NEXT:    vmaskmovps 32(%rdi), %ymm1, %ymm2
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test4:
 ; AVX2:       ## BB#0:
@@ -130,23 +154,31 @@ define <16 x float> @test4(<16 x i32> %t
 ; AVX512-NEXT:    vmovups (%rdi), %zmm1 {%k1}
 ; AVX512-NEXT:    vmovaps %zmm1, %zmm0
 ; AVX512-NEXT:    retq
-;
-; SKX-LABEL: test4:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
-; SKX-NEXT:    vmovups (%rdi), %zmm1 {%k1}
-; SKX-NEXT:    vmovaps %zmm1, %zmm0
-; SKX-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
   ret <16 x float> %res
 }
 
 define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test5:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test5:
 ; AVX2:       ## BB#0:
@@ -162,13 +194,13 @@ define <8 x double> @test5(<8 x i32> %tr
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test5:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
-; AVX512-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test5:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test5:
 ; SKX:       ## BB#0:
@@ -183,43 +215,21 @@ define <8 x double> @test5(<8 x i32> %tr
 }
 
 define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
-; AVX1-LABEL: test6:
-; AVX1:       ## BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX1-NEXT:    ## implicit-def: %XMM2
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB5_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:  LBB5_2: ## %else
-; AVX1-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB5_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    vmovhpd 8(%rdi), %xmm2, %xmm2
-; AVX1-NEXT:  LBB5_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test6:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
-; AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test6:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: test6:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: test6:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test6:
 ; SKX:       ## BB#0:
@@ -234,57 +244,21 @@ define <2 x double> @test6(<2 x i64> %tr
 }
 
 define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
-; AVX1-LABEL: test7:
-; AVX1:       ## BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX1-NEXT:    ## implicit-def: %XMM2
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB6_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT:  LBB6_2: ## %else
-; AVX1-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB6_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; AVX1-NEXT:  LBB6_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB6_6
-; AVX1-NEXT:  ## BB#5: ## %cond.load4
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT:  LBB6_6: ## %else5
-; AVX1-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB6_8
-; AVX1-NEXT:  ## BB#7: ## %cond.load7
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT:  LBB6_8: ## %else8
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test7:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test7:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: test7:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: test7:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test7:
 ; SKX:       ## BB#0:
@@ -302,36 +276,8 @@ define <4 x i32> @test8(<4 x i32> %trigg
 ; AVX1-LABEL: test8:
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX1-NEXT:    ## implicit-def: %XMM2
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB7_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT:  LBB7_2: ## %else
-; AVX1-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB7_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    vpinsrd $1, 4(%rdi), %xmm2, %xmm2
-; AVX1-NEXT:  LBB7_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB7_6
-; AVX1-NEXT:  ## BB#5: ## %cond.load4
-; AVX1-NEXT:    vpinsrd $2, 8(%rdi), %xmm2, %xmm2
-; AVX1-NEXT:  LBB7_6: ## %else5
-; AVX1-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB7_8
-; AVX1-NEXT:  ## BB#7: ## %cond.load7
-; AVX1-NEXT:    vpinsrd $3, 12(%rdi), %xmm2, %xmm2
-; AVX1-NEXT:  LBB7_8: ## %else8
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -343,13 +289,13 @@ define <4 x i32> @test8(<4 x i32> %trigg
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test8:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test8:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test8:
 ; SKX:       ## BB#0:
@@ -367,33 +313,8 @@ define void @test9(<4 x i32> %trigger, <
 ; AVX1-LABEL: test9:
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB8_2
-; AVX1-NEXT:  ## BB#1: ## %cond.store
-; AVX1-NEXT:    vmovd %xmm1, (%rdi)
-; AVX1-NEXT:  LBB8_2: ## %else
-; AVX1-NEXT:    vpextrb $4, %xmm2, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB8_4
-; AVX1-NEXT:  ## BB#3: ## %cond.store1
-; AVX1-NEXT:    vpextrd $1, %xmm1, 4(%rdi)
-; AVX1-NEXT:  LBB8_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB8_6
-; AVX1-NEXT:  ## BB#5: ## %cond.store3
-; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
-; AVX1-NEXT:  LBB8_6: ## %else4
-; AVX1-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB8_8
-; AVX1-NEXT:  ## BB#7: ## %cond.store5
-; AVX1-NEXT:    vpextrd $3, %xmm1, 12(%rdi)
-; AVX1-NEXT:  LBB8_8: ## %else6
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test9:
@@ -403,12 +324,12 @@ define void @test9(<4 x i32> %trigger, <
 ; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test9:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test9:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
@@ -425,45 +346,12 @@ define <4 x double> @test10(<4 x i32> %t
 ; AVX1-LABEL: test10:
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX1-NEXT:    ## implicit-def: %YMM2
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB9_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:  LBB9_2: ## %else
-; AVX1-NEXT:    vpextrb $4, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB9_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    vmovhpd 8(%rdi), %xmm2, %xmm3
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; AVX1-NEXT:  LBB9_4: ## %else2
-; AVX1-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB9_6
-; AVX1-NEXT:  ## BB#5: ## %cond.load4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovlpd 16(%rdi), %xmm4, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT:  LBB9_6: ## %else5
-; AVX1-NEXT:    vpextrb $12, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB9_8
-; AVX1-NEXT:  ## BB#7: ## %cond.load7
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vmovhpd 24(%rdi), %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:  LBB9_8: ## %else8
-; AVX1-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -476,14 +364,14 @@ define <4 x double> @test10(<4 x i32> %t
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test10:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX512-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
-; AVX512-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test10:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX512F-NEXT:    vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## BB#0:
@@ -497,10 +385,56 @@ define <4 x double> @test10(<4 x i32> %t
   ret <4 x double> %res
 }
 
+define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; AVX1-LABEL: test10b:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test10b:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: test10b:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; SKX-LABEL: test10b:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
+  ret <4 x double> %res
+}
+
 define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test11a:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test11a:
 ; AVX2:       ## BB#0:
@@ -510,15 +444,15 @@ define <8 x float> @test11a(<8 x i32> %t
 ; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test11a:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
-; AVX512-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vmovups (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test11a:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test11a:
 ; SKX:       ## BB#0:
@@ -533,9 +467,18 @@ define <8 x float> @test11a(<8 x i32> %t
 }
 
 define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
-; Bypassing exact checking here because it's over 70 lines.
 ; AVX1-LABEL: test11b:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test11b:
 ; AVX2:       ## BB#0:
@@ -546,16 +489,16 @@ define <8 x i32> @test11b(<8 x i1> %mask
 ; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test11b:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; AVX512-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test11b:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test11b:
 ; SKX:       ## BB#0:
@@ -569,9 +512,17 @@ define <8 x i32> @test11b(<8 x i1> %mask
 }
 
 define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
-; Bypassing exact checking here because it's over 70 lines.
 ; AVX1-LABEL: test11c:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test11c:
 ; AVX2:       ## BB#0:
@@ -581,15 +532,15 @@ define <8 x float> @test11c(<8 x i1> %ma
 ; AVX2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test11c:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; AVX512-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test11c:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test11c:
 ; SKX:       ## BB#0:
@@ -602,9 +553,17 @@ define <8 x float> @test11c(<8 x i1> %ma
 }
 
 define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
-; Bypassing exact checking here because it's over 70 lines.
 ; AVX1-LABEL: test11d:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test11d:
 ; AVX2:       ## BB#0:
@@ -614,15 +573,15 @@ define <8 x i32> @test11d(<8 x i1> %mask
 ; AVX2-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test11d:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; AVX512-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test11d:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test11d:
 ; SKX:       ## BB#0:
@@ -635,9 +594,16 @@ define <8 x i32> @test11d(<8 x i1> %mask
 }
 
 define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
-; Bypassing exact checking here because it's over 90 lines.
 ; AVX1-LABEL: test12:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test12:
 ; AVX2:       ## BB#0:
@@ -647,14 +613,14 @@ define void @test12(<8 x i32> %trigger,
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test12:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
-; AVX512-NEXT:    kshiftlw $8, %k0, %k0
-; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test12:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test12:
 ; SKX:       ## BB#0:
@@ -668,9 +634,21 @@ define void @test12(<8 x i32> %trigger,
 }
 
 define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
-; Bypassing exact checking here because it's over 300 lines.
 ; AVX1-LABEL: test13:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX1-NEXT:    vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test13:
 ; AVX2:       ## BB#0:
@@ -688,13 +666,6 @@ define void @test13(<16 x i32> %trigger,
 ; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
 ; AVX512-NEXT:    vmovups %zmm1, (%rdi) {%k1}
 ; AVX512-NEXT:    retq
-;
-; SKX-LABEL: test13:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
-; SKX-NEXT:    vmovups %zmm1, (%rdi) {%k1}
-; SKX-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
   ret void
@@ -704,22 +675,11 @@ define void @test14(<2 x i32> %trigger,
 ; AVX1-LABEL: test14:
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB16_2
-; AVX1-NEXT:  ## BB#1: ## %cond.store
-; AVX1-NEXT:    vmovss %xmm1, (%rdi)
-; AVX1-NEXT:  LBB16_2: ## %else
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB16_4
-; AVX1-NEXT:  ## BB#3: ## %cond.store1
-; AVX1-NEXT:    vextractps $1, %xmm1, 4(%rdi)
-; AVX1-NEXT:  LBB16_4: ## %else2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test14:
@@ -732,15 +692,15 @@ define void @test14(<2 x i32> %trigger,
 ; AVX2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test14:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test14:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test14:
 ; SKX:       ## BB#0:
@@ -760,22 +720,12 @@ define void @test15(<2 x i32> %trigger,
 ; AVX1-LABEL: test15:
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrb $0, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB17_2
-; AVX1-NEXT:  ## BB#1: ## %cond.store
-; AVX1-NEXT:    vmovd %xmm1, (%rdi)
-; AVX1-NEXT:  LBB17_2: ## %else
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB17_4
-; AVX1-NEXT:  ## BB#3: ## %cond.store1
-; AVX1-NEXT:    vpextrd $2, %xmm1, 4(%rdi)
-; AVX1-NEXT:  LBB17_4: ## %else2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test15:
@@ -789,16 +739,16 @@ define void @test15(<2 x i32> %trigger,
 ; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test15:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test15:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test15:
 ; SKX:       ## BB#0:
@@ -815,29 +765,12 @@ define void @test15(<2 x i32> %trigger,
 define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
 ; AVX1-LABEL: test16:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX1-NEXT:    ## implicit-def: %XMM2
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB18_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT:  LBB18_2: ## %else
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB18_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; AVX1-NEXT:  LBB18_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -852,16 +785,16 @@ define <2 x float> @test16(<2 x i32> %tr
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test16:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test16:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test16:
 ; SKX:       ## BB#0:
@@ -881,29 +814,15 @@ define <2 x float> @test16(<2 x i32> %tr
 define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
 ; AVX1-LABEL: test17:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $0, %xmm2, %eax
-; AVX1-NEXT:    ## implicit-def: %XMM2
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB19_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT:  LBB19_2: ## %else
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpextrb $8, %xmm3, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB19_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    movl 4(%rdi), %eax
-; AVX1-NEXT:    vpinsrq $1, %rax, %xmm2, %xmm2
-; AVX1-NEXT:  LBB19_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test17:
@@ -919,18 +838,18 @@ define <2 x i32> @test17(<2 x i32> %trig
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test17:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test17:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test17:
 ; SKX:       ## BB#0:
@@ -951,30 +870,12 @@ define <2 x i32> @test17(<2 x i32> %trig
 define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
 ; AVX1-LABEL: test18:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrb $0, %xmm1, %eax
-; AVX1-NEXT:    ## implicit-def: %XMM1
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB20_2
-; AVX1-NEXT:  ## BB#1: ## %cond.load
-; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:  LBB20_2: ## %else
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpextrb $8, %xmm2, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    je LBB20_4
-; AVX1-NEXT:  ## BB#3: ## %cond.load1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; AVX1-NEXT:  LBB20_4: ## %else2
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test18:
@@ -987,15 +888,15 @@ define <2 x float> @test18(<2 x i32> %tr
 ; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test18:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test18:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test18:
 ; SKX:       ## BB#0:
@@ -1012,22 +913,17 @@ define <2 x float> @test18(<2 x i32> %tr
 }
 
 define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) {
-; AVX1-LABEL: test19:
-; AVX1:       ## BB#0:
-; AVX1-NEXT:    vmovups (%rdi), %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test19:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test19:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: test19:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: test19:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test19:
 ; SKX:       ## BB#0:
@@ -1040,24 +936,19 @@ define <4 x float> @test19(<4 x i32> %tr
 }
 
 define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
-; AVX1-LABEL: test20:
-; AVX1:       ## BB#0:
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm1[1],mem[2,3]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test20:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295]
-; AVX2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test20:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295]
-; AVX512-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX512-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: test20:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295]
+; AVX-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512F-LABEL: test20:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,0,4294967295,4294967295]
+; AVX512F-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test20:
 ; SKX:       ## BB#0:
@@ -1074,7 +965,8 @@ define <4 x float> @test20(<4 x i32> %tr
 define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
 ; AVX1-LABEL: test21:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    vmovups %xmm1, (%rdi)
+; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test21:
@@ -1083,11 +975,11 @@ define void @test21(<4 x i32> %trigger,
 ; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test21:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test21:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test21:
 ; SKX:       ## BB#0:
@@ -1102,7 +994,9 @@ define void @test21(<4 x i32> %trigger,
 define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
 ; AVX1-LABEL: test22:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    vmovd %xmm1, (%rdi)
+; AVX1-NEXT:    movl $-1, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test22:
@@ -1112,12 +1006,12 @@ define void @test22(<4 x i32> %trigger,
 ; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test22:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    movl $-1, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test22:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test22:
 ; SKX:       ## BB#0:
@@ -1155,9 +1049,30 @@ declare void @llvm.masked.store.v2i64(<2
 declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
 
 define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
-; Bypassing exact checking here because it's over 700 lines.
 ; AVX1-LABEL: test23:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm3, %ymm3
+; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm2, %ymm2
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test23:
 ; AVX2:       ## BB#0:
@@ -1180,15 +1095,6 @@ define <16 x i32*> @test23(<16 x i32*> %
 ; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
 ; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
 ; AVX512-NEXT:    retq
-;
-; SKX-LABEL: test23:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
-; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k2
-; SKX-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
-; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
-; SKX-NEXT:    retq
   %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
   %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
   ret <16 x i32*> %res
@@ -1199,9 +1105,45 @@ define <16 x i32*> @test23(<16 x i32*> %
 declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
 
 define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test24:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm1, %ymm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm1, %ymm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm0, %ymm1
+; AVX1-NEXT:    vmovapd %ymm4, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test24:
 ; AVX2:       ## BB#0:
@@ -1231,15 +1173,15 @@ define <16 x %mystruct*> @test24(<16 x i
 ; AVX2-NEXT:    vmovdqa %ymm4, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test24:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test24:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test24:
 ; SKX:       ## BB#0:
@@ -1254,9 +1196,45 @@ define <16 x %mystruct*> @test24(<16 x i
 }
 
 define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test_store_16i64:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_store_16i64:
 ; AVX2:       ## BB#0:
@@ -1286,15 +1264,15 @@ define void @test_store_16i64(<16 x i64>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_store_16i64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_store_16i64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_store_16i64:
 ; SKX:       ## BB#0:
@@ -1310,9 +1288,45 @@ define void @test_store_16i64(<16 x i64>
 declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
 
 define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test_store_16f64:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT:    vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_store_16f64:
 ; AVX2:       ## BB#0:
@@ -1342,15 +1356,15 @@ define void @test_store_16f64(<16 x doub
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_store_16f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_store_16f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_store_16f64:
 ; SKX:       ## BB#0:
@@ -1366,9 +1380,49 @@ define void @test_store_16f64(<16 x doub
 declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
 
 define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test_load_16i64:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT:    vmovapd %ymm5, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_load_16i64:
 ; AVX2:       ## BB#0:
@@ -1402,17 +1456,17 @@ define <16 x i64> @test_load_16i64(<16 x
 ; AVX2-NEXT:    vmovapd %ymm5, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_load_16i64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    vmovaps %zmm2, %zmm1
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_load_16i64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm1
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_load_16i64:
 ; SKX:       ## BB#0:
@@ -1430,9 +1484,49 @@ define <16 x i64> @test_load_16i64(<16 x
 declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
 
 define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
-; Bypassing exact checking here because it's over 100 lines.
 ; AVX1-LABEL: test_load_16f64:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT:    vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT:    vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT:    vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT:    vmovapd %ymm5, %ymm0
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_load_16f64:
 ; AVX2:       ## BB#0:
@@ -1466,17 +1560,17 @@ define <16 x double> @test_load_16f64(<1
 ; AVX2-NEXT:    vmovapd %ymm5, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_load_16f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    vmovaps %zmm2, %zmm1
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_load_16f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm1
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_load_16f64:
 ; SKX:       ## BB#0:
@@ -1494,9 +1588,111 @@ define <16 x double> @test_load_16f64(<1
 declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
 
 define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0)  {
-; Bypassing exact checking here because it's over 300 lines.
 ; AVX1-LABEL: test_load_32f64:
-; AVX1-NOT:   maskmov
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:  Ltmp0:
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:  Ltmp1:
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    movq %rsp, %rbp
+; AVX1-NEXT:  Ltmp2:
+; AVX1-NEXT:    .cfi_def_cfa_register %rbp
+; AVX1-NEXT:    andq $-32, %rsp
+; AVX1-NEXT:    subq $32, %rsp
+; AVX1-NEXT:    vmovapd 16(%rbp), %ymm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm9, %xmm9
+; AVX1-NEXT:    vpsrad $31, %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxdq %xmm9, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm9, %xmm9
+; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT:    vmaskmovpd 32(%rsi), %ymm9, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm9, %ymm10, %ymm2, %ymm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT:    vmaskmovpd 64(%rsi), %ymm2, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm10, %ymm3, %ymm11
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT:    vmaskmovpd 96(%rsi), %ymm2, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm10, %ymm4, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT:    vmaskmovpd 160(%rsi), %ymm3, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm10, %ymm6, %ymm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT:    vmaskmovpd 192(%rsi), %ymm3, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm10, %ymm7, %ymm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT:    vmaskmovpd 224(%rsi), %ymm3, %ymm10
+; AVX1-NEXT:    vblendvpd %ymm3, %ymm10, %ymm8, %ymm3
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX1-NEXT:    vmaskmovpd (%rsi), %ymm0, %ymm8
+; AVX1-NEXT:    vblendvpd %ymm0, %ymm8, %ymm1, %ymm0
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmaskmovpd 128(%rsi), %ymm1, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT:    vmovapd %ymm1, 128(%rdi)
+; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
+; AVX1-NEXT:    vmovapd %ymm3, 224(%rdi)
+; AVX1-NEXT:    vmovapd %ymm7, 192(%rdi)
+; AVX1-NEXT:    vmovapd %ymm6, 160(%rdi)
+; AVX1-NEXT:    vmovapd %ymm4, 96(%rdi)
+; AVX1-NEXT:    vmovapd %ymm11, 64(%rdi)
+; AVX1-NEXT:    vmovapd %ymm9, 32(%rdi)
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    movq %rbp, %rsp
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_load_32f64:
 ; AVX2:       ## BB#0:
@@ -1580,26 +1776,26 @@ define <32 x double> @test_load_32f64(<3
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_load_32f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX512-NEXT:    vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT:    vpslld $31, %zmm5, %zmm5
-; AVX512-NEXT:    vptestmd %zmm5, %zmm5, %k1
-; AVX512-NEXT:    vmovupd 128(%rdi), %zmm3 {%k1}
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT:    vptestmd %zmm0, %zmm0, %k2
-; AVX512-NEXT:    vmovupd (%rdi), %zmm1 {%k2}
-; AVX512-NEXT:    kshiftrw $8, %k1, %k1
-; AVX512-NEXT:    vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512-NEXT:    kshiftrw $8, %k2, %k1
-; AVX512-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT:    vmovaps %zmm1, %zmm0
-; AVX512-NEXT:    vmovaps %zmm2, %zmm1
-; AVX512-NEXT:    vmovaps %zmm3, %zmm2
-; AVX512-NEXT:    vmovaps %zmm4, %zmm3
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_load_32f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX512F-NEXT:    vpmovsxbd %xmm5, %zmm5
+; AVX512F-NEXT:    vpslld $31, %zmm5, %zmm5
+; AVX512F-NEXT:    vptestmd %zmm5, %zmm5, %k1
+; AVX512F-NEXT:    vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512F-NEXT:    vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT:    kshiftrw $8, %k2, %k1
+; AVX512F-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm1
+; AVX512F-NEXT:    vmovaps %zmm3, %zmm2
+; AVX512F-NEXT:    vmovaps %zmm4, %zmm3
+; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_load_32f64:
 ; SKX:       ## BB#0:
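
(For reference, a minimal standalone IR sketch, not part of this patch or of masked_memop.ll: a masked load with a live pass-through operand, the form for which the AVX-prefixed checks above for test20 pair a vmaskmovps with a vblendvps merge against %src0 on AVX1. The function name below is made up for illustration; the intrinsic declaration follows the same pattern as the declarations in the test.)

define <4 x float> @load_with_passthru(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
  ; Lanes where %trigger is zero are loaded from %addr; the remaining lanes keep %src0.
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %src0)
  ret <4 x float> %res
}

declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)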

Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll?rev=258675&r1=258674&r2=258675&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll Mon Jan 25 04:17:11 2016
@@ -1,9 +1,7 @@
-; RUN: opt < %s  -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX1
-; RUN: opt < %s  -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX2
+; RUN: opt < %s  -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s  -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
 ; RUN: opt < %s  -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
 
-;AVX1-NOT: llvm.masked
-
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc_linux"
 
@@ -18,12 +16,12 @@ target triple = "x86_64-pc_linux"
 ;  }
 ;}
 
-;AVX2-LABEL: @foo1
-;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
-;AVX2: call <8 x i32> @llvm.masked.load.v8i32
-;AVX2: add nsw <8 x i32>
-;AVX2: call void @llvm.masked.store.v8i32
-;AVX2: ret void
+;AVX-LABEL: @foo1
+;AVX: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
+;AVX: call <8 x i32> @llvm.masked.load.v8i32
+;AVX: add nsw <8 x i32>
+;AVX: call void @llvm.masked.store.v8i32
+;AVX: ret void
 
 ;AVX512-LABEL: @foo1
 ;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100, i32 100
@@ -102,12 +100,12 @@ for.end:
 ;  }
 ;}
 
-;AVX2-LABEL: @foo2
-;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
-;AVX2: call <8 x float> @llvm.masked.load.v8f32
-;AVX2: fadd <8 x float>
-;AVX2: call void @llvm.masked.store.v8f32
-;AVX2: ret void
+;AVX-LABEL: @foo2
+;AVX: icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100
+;AVX: call <8 x float> @llvm.masked.load.v8f32
+;AVX: fadd <8 x float>
+;AVX: call void @llvm.masked.store.v8f32
+;AVX: ret void
 
 ;AVX512-LABEL: @foo2
 ;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100, i32 100
@@ -187,13 +185,13 @@ for.end:
 ;  }
 ;}
 
-;AVX2-LABEL: @foo3
-;AVX2: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
-;AVX2: call <4 x double> @llvm.masked.load.v4f64
-;AVX2: sitofp <4 x i32> %wide.load to <4 x double>
-;AVX2: fadd <4 x double>
-;AVX2: call void @llvm.masked.store.v4f64
-;AVX2: ret void
+;AVX-LABEL: @foo3
+;AVX: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
+;AVX: call <4 x double> @llvm.masked.load.v4f64
+;AVX: sitofp <4 x i32> %wide.load to <4 x double>
+;AVX: fadd <4 x double>
+;AVX: call void @llvm.masked.store.v4f64
+;AVX: ret void
 
 ;AVX512-LABEL: @foo3
 ;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
@@ -275,9 +273,9 @@ for.end:
 ;  }
 ;}
 
-;AVX2-LABEL: @foo4
-;AVX2-NOT: llvm.masked
-;AVX2: ret void
+;AVX-LABEL: @foo4
+;AVX-NOT: llvm.masked
+;AVX: ret void
 
 ;AVX512-LABEL: @foo4
 ;AVX512-NOT: llvm.masked
@@ -349,10 +347,10 @@ for.end:
 
 ; The loop here should not be vectorized due to trapping
 ; constant expression
-;AVX2-LABEL: @foo5
-;AVX2-NOT: llvm.masked
-;AVX2: store i32 sdiv
-;AVX2: ret void
+;AVX-LABEL: @foo5
+;AVX-NOT: llvm.masked
+;AVX: store i32 sdiv
+;AVX: ret void
 
 ;AVX512-LABEL: @foo5
 ;AVX512-NOT: llvm.masked
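
(Illustrative sketch only, not part of the patch: a hand-written masked copy in the shape the AVX check lines above match once the vectorizer emits masked intrinsics, i.e. a compare-derived <8 x i1> mask feeding @llvm.masked.load.v8f32 and @llvm.masked.store.v8f32. The function name is hypothetical; the intrinsic declarations follow the pattern used in masked_memop.ll.)

define void @masked_copy(<8 x i32>* %tp, <8 x float>* %src, <8 x float>* %dst) {
  ; Mask is true where the trigger element is below 100, as in the foo1/foo2 loops.
  %trigger = load <8 x i32>, <8 x i32>* %tp, align 4
  %mask = icmp slt <8 x i32> %trigger, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
  ; Inactive lanes of the load produce undef; inactive lanes of %dst are left untouched.
  %val = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %src, i32 4, <8 x i1> %mask, <8 x float> undef)
  call void @llvm.masked.store.v8f32(<8 x float> %val, <8 x float>* %dst, i32 4, <8 x i1> %mask)
  ret void
}

declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare void @llvm.masked.store.v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)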



