[llvm] r308868 - [X86][AVX512] Add patterns for masked AVX512 floating point compare instructions that were missing.

Ayman Musa via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 24 01:10:32 PDT 2017


Author: aymanmus
Date: Mon Jul 24 01:10:32 2017
New Revision: 308868

URL: http://llvm.org/viewvc/llvm-project?rev=308868&view=rev
Log:
[X86][AVX512] Add patterns for masked AVX512 floating point compare instructions that were missing.

These patterns were missed by D33188; adding them for completeness.
Also updating the affected test.

Differential Revision: https://reviews.llvm.org/D35179
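
For reference, the new patterns let instruction selection fold the mask AND
directly into a k-masked compare. A minimal sketch of the IR shape they
target, adapted from the updated test (function and value names here are
illustrative):

  define zeroext i8 @masked_cmp_sketch(i4 zeroext %__u, <4 x float> %a, <4 x float> %b) {
  entry:
    %cmp  = fcmp oeq <4 x float> %a, %b      ; selected as an X86cmpm node
    %m    = bitcast i4 %__u to <4 x i1>
    %and  = and <4 x i1> %cmp, %m            ; now folded into the {%k1}-masked compare
    ; widening the result into zeros matches (insert_subvector immAllZerosV, ..., 0)
    %wide = shufflevector <4 x i1> %and, <4 x i1> zeroinitializer,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %res  = bitcast <8 x i1> %wide to i8
    ret i8 %res
  }

With AVX512VL this now selects to a single masked compare
(vcmpeqps %xmm1, %xmm0, %k0 {%k1}) rather than an unmasked compare followed
by a separate vandps, as the VLX check lines in the updated test show.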


Modified:
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=308868&r1=308867&r2=308868&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Jul 24 01:10:32 2017
@@ -2457,7 +2457,19 @@ let Predicates = Preds in {
                               NewInf.KRC)>;
 
   def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
-                              (_.KVT (X86cmpm (_.VT _.RC:$src1),
+                              (_.KVT (and _.KRCWM:$mask, 
+                                          (X86cmpm (_.VT _.RC:$src1), 
+                                                   (_.VT _.RC:$src2), 
+                                                    imm:$cc))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask,
+                                                                  _.RC:$src1, 
+                                                                  _.RC:$src2,
+                                                                  imm:$cc),
+                              NewInf.KRC)>;
+  
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (X86cmpm (_.VT _.RC:$src1), 
                                               (_.VT (bitconvert (_.LdFrag addr:$src2))),
                                               imm:$cc)),
                               (i64 0)),
@@ -2467,6 +2479,19 @@ let Predicates = Preds in {
                               NewInf.KRC)>;
 
   def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask, 
+                                         (X86cmpm (_.VT _.RC:$src1), 
+                                                  (_.VT (bitconvert 
+                                                         (_.LdFrag addr:$src2))),
+                                                  imm:$cc))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask,
+                                                                  _.RC:$src1,
+                                                                  addr:$src2,
+                                                                  imm:$cc),
+                              NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
                               (_.KVT (X86cmpm (_.VT _.RC:$src1),
                                               (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
                                               imm:$cc)),
@@ -2475,6 +2500,19 @@ let Predicates = Preds in {
                                                                   addr:$src2,
                                                                   imm:$cc),
                               NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask, 
+                                          (X86cmpm (_.VT _.RC:$src1),
+                                                   (X86VBroadcast 
+                                                    (_.ScalarLdFrag addr:$src2)),
+                                                   imm:$cc))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbik) _.KRCWM:$mask,
+                                                                   _.RC:$src1,
+                                                                   addr:$src2,
+                                                                   imm:$cc),
+                              NewInf.KRC)>;
 }
 }
 
@@ -2493,6 +2531,19 @@ let Predicates = Preds in
                                                                  _.RC:$src2,
                                                                  imm:$cc),
                               NewInf.KRC)>;
+
+  def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+                              (_.KVT (and _.KRCWM:$mask, 
+                                          (X86cmpmRnd (_.VT _.RC:$src1), 
+                                                      (_.VT _.RC:$src2), 
+                                                      imm:$cc,
+                                                      (i32 FROUND_NO_EXC)))),
+                              (i64 0)),
+            (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rribk) _.KRCWM:$mask, 
+                                                                   _.RC:$src1, 
+                                                                   _.RC:$src2,
+                                                                   imm:$cc),
+                              NewInf.KRC)>;
 }
 
 

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=308868&r1=308867&r2=308868&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Mon Jul 24 01:10:32 2017
@@ -51802,6 +51802,206 @@ entry:
   ret i8 %4
 }
 
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %1 = bitcast <2 x i64> %__b to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT:    vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <4 x float> undef, float %load, i32 0
+  %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
 
 define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
@@ -51969,90 +52169,287 @@ entry:
   ret i16 %4
 }
 
-
-define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1808:
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1809:
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1810:
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
   %1 = bitcast <2 x i64> %__b to <4 x float>
   %2 = fcmp oeq <4 x float> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
 }
 
-define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %xmm0, %k0
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %xmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1811:
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1812:
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1813:
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <2 x i64> %__a to <4 x float>
-  %load = load <2 x i64>, <2 x i64>* %__b
-  %1 = bitcast <2 x i64> %load to <4 x float>
-  %2 = fcmp oeq <4 x float> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT:    vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <4 x float> undef, float %load, i32 0
+  %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1808:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1809:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1810:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %1 = bitcast <2 x i64> %__b to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1811:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1812:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1813:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   %4 = bitcast <32 x i1> %3 to i32
   ret i32 %4
 }
@@ -52103,15 +52500,16 @@ entry:
   ret i32 %4
 }
 
-
-define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1817:
@@ -52123,11 +52521,13 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
 ; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -52137,11 +52537,8 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
@@ -52149,19 +52546,23 @@ entry:
   %0 = bitcast <2 x i64> %__a to <4 x float>
   %1 = bitcast <2 x i64> %__b to <4 x float>
   %2 = fcmp oeq <4 x float> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %xmm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1820:
@@ -52173,11 +52574,13 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -52187,11 +52590,8 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
@@ -52200,19 +52600,23 @@ entry:
   %load = load <2 x i64>, <2 x i64>* %__b
   %1 = bitcast <2 x i64> %load to <4 x float>
   %2 = fcmp oeq <4 x float> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi){1to4}, %xmm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1823:
@@ -52224,12 +52628,14 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vbroadcastss (%rdi), %xmm1
-; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT:    vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -52239,11 +52645,8 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
@@ -52253,109 +52656,23 @@ entry:
   %vec = insertelement <4 x float> undef, float %load, i32 0
   %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   %2 = fcmp oeq <4 x float> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
 
-define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT:    vzeroupper
-; VLX-NEXT:    retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %1 = bitcast <4 x i64> %__b to <8 x float>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
-}
-
-define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT:    vzeroupper
-; VLX-NEXT:    retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %load = load <4 x i64>, <4 x i64>* %__b
-  %1 = bitcast <4 x i64> %load to <8 x float>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
-}
-
-define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT:    vzeroupper
-; VLX-NEXT:    retq
-;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %load = load float, float* %__b
-  %vec = insertelement <8 x float> undef, float %load, i32 0
-  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
-}
-
-
-define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1826:
@@ -52366,72 +52683,46 @@ define zeroext i32 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:  .Lcfi1828:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %1 = bitcast <4 x i64> %__b to <8 x float>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %1 = bitcast <2 x i64> %__b to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
 }
 
-define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    vcmpeqps (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1829:
@@ -52442,73 +52733,47 @@ define zeroext i32 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:  .Lcfi1831:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %load = load <4 x i64>, <4 x i64>* %__b
-  %1 = bitcast <4 x i64> %load to <8 x float>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
 }
 
-define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi){1to8}, %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    vcmpeqps (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1832:
@@ -52519,75 +52784,51 @@ define zeroext i32 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:  .Lcfi1834:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vbroadcastss (%rdi), %xmm1
+; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %0 = bitcast <2 x i64> %__a to <4 x float>
   %load = load float, float* %__b
-  %vec = insertelement <8 x float> undef, float %load, i32 0
-  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %vec = insertelement <4 x float> undef, float %load, i32 0
+  %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
 }
 
-
-define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovq %k0, %rax
-; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1835:
@@ -52598,77 +52839,56 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:  .Lcfi1837:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
 ; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %1 = bitcast <4 x i64> %__b to <8 x float>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %1 = bitcast <2 x i64> %__b to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
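 The masked test above ANDs the i4 write-mask into the compare result (the
 `and <4 x i1>` in the IR), which is what the {%k1} form in the VLX check
 lines encodes. A hedged C sketch under the same assumptions, with
 illustrative names:

   #include <immintrin.h>
   #include <stdint.h>

   /* Sketch: the low 4 bits of `u` select which compare results survive;
      the remaining bits of the 64-bit result stay zero. */
   uint64_t masked_cmpeq_ps_4_to_64(uint8_t u, __m128 a, __m128 b) {
     __mmask8 m = _mm_mask_cmp_ps_mask((__mmask8)u, a, b, _CMP_EQ_OQ);
     return (uint64_t)m;
   }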
 
-define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %ymm0, %k0
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %xmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovq %k0, %rax
-; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1838:
@@ -52679,78 +52899,57 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:  .Lcfi1840:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT:    vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
 ; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <8 x float>
-  %load = load <4 x i64>, <4 x i64>* %__b
-  %1 = bitcast <4 x i64> %load to <8 x float>
-  %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <4 x float>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
-define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi){1to8}, %ymm0, %k0
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovq %k0, %rax
-; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
 ; NoVLX-NEXT:  .Lcfi1841:
@@ -52761,15 +52960,265 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:  .Lcfi1843:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT:    vbroadcastss (%rsi), %xmm3
+; NoVLX-NEXT:    vcmpeqps %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <4 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <4 x float> undef, float %load, i32 0
+  %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x float> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
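+The _mask_mem_b tests splat a single float from memory before the compare;
+with VLX this folds into an embedded-broadcast operand, as in the
+(%rsi){1to4} check line above. A sketch with illustrative names, assuming
+AVX512F + AVX512VL:
+
+  #include <immintrin.h>
+  #include <stdint.h>
+
+  /* Sketch: scalar load + 4-way splat, then a masked compare; a compiler
+     may fold the splat into the compare's memory operand on VLX targets. */
+  uint64_t masked_cmpeq_ps_bcast_to_64(uint8_t u, __m128 a, const float *p) {
+    __m128 b = _mm_set1_ps(*p);
+    __mmask8 m = _mm_mask_cmp_ps_mask((__mmask8)u, a, b, _CMP_EQ_OQ);
+    return (uint64_t)m;
+  }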
+
+
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %1 = bitcast <4 x i64> %__b to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <8 x float> undef, float %load, i32 0
+  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %1 = bitcast <4 x i64> %__b to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovaps (%rsi), %ymm1
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <8 x float> undef, float %load, i32 0
+  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
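+The v8i1 -> i16 masked tests above are the 256-bit analogue: an 8-bit
+write-mask gates a ymm compare, and the 8 result bits are zero-extended to
+i16. Sketch with illustrative names, assuming AVX512F + AVX512VL:
+
+  #include <immintrin.h>
+  #include <stdint.h>
+
+  /* Sketch: all 8 bits of `u` are live here; the upper byte of the i16
+     result stays zero, matching the <16 x i1> bitcast in the IR above. */
+  uint16_t masked_cmpeq_ps_8_to_16(uint8_t u, __m256 a, __m256 b) {
+    __mmask8 m = _mm256_mask_cmp_ps_mask((__mmask8)u, a, b, _CMP_EQ_OQ);
+    return (uint16_t)m;
+  }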
+
+
+
+define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1844:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1845:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1846:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -52797,9 +53246,9 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
@@ -52807,656 +53256,483 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x float>
-  %load = load float, float* %__b
-  %vec = insertelement <8 x float> undef, float %load, i32 0
-  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %1 = bitcast <4 x i64> %__b to <8 x float>
   %2 = fcmp oeq <8 x float> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
 }
 
-
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT:    vcmpeqps (%rdi), %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1844:
+; NoVLX-NEXT:  .Lcfi1847:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1845:
+; NoVLX-NEXT:  .Lcfi1848:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1846:
+; NoVLX-NEXT:  .Lcfi1849:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1847:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1848:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1849:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1850:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1851:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %r9d
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %edx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = fcmp oeq <16 x float> %0, %1
-  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <32 x i1> %3 to i32
   ret i32 %4
 }
 
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT:    vcmpeqps (%rdi){1to8}, %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1852:
+; NoVLX-NEXT:  .Lcfi1850:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1853:
+; NoVLX-NEXT:  .Lcfi1851:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1854:
+; NoVLX-NEXT:  .Lcfi1852:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1855:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1856:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1857:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1858:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1859:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
-; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %r9d
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %edx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %load = load <8 x i64>, <8 x i64>* %__b
-  %1 = bitcast <8 x i64> %load to <16 x float>
-  %2 = fcmp oeq <16 x float> %0, %1
-  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <8 x float> undef, float %load, i32 0
+  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <32 x i1> %3 to i32
   ret i32 %4
 }
 
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1860:
+; NoVLX-NEXT:  .Lcfi1853:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1861:
+; NoVLX-NEXT:  .Lcfi1854:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1862:
+; NoVLX-NEXT:  .Lcfi1855:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1863:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1864:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1865:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1866:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1867:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
-; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %r9d
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %edx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %load = load float, float* %__b
-  %vec = insertelement <16 x float> undef, float %load, i32 0
-  %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <16 x float> %0, %1
-  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %1 = bitcast <4 x i64> %__b to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovw %k0, %eax
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1856:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1857:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1858:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovaps (%rsi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
-  %3 = zext i16 %2 to i32
-  ret i32 %3
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1868:
+; NoVLX-NEXT:  .Lcfi1859:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1869:
+; NoVLX-NEXT:  .Lcfi1860:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1870:
+; NoVLX-NEXT:  .Lcfi1861:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1871:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1872:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1873:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1874:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1875:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vbroadcastss (%rsi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %r9d
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %edx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
-; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
-; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
-; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = fcmp oeq <16 x float> %0, %1
-  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <8 x float> undef, float %load, i32 0
+  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+
+
+define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0
 ; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1876:
+; NoVLX-NEXT:  .Lcfi1862:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1877:
+; NoVLX-NEXT:  .Lcfi1863:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1878:
+; NoVLX-NEXT:  .Lcfi1864:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1879:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1880:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1881:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1882:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1883:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
-; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %r9d
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %edx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %esi
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %edi
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53465,128 +53741,161 @@ define zeroext i64 @test_vcmpoeqps_v16i1
 ; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
 ; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %load = load <8 x i64>, <8 x i64>* %__b
-  %1 = bitcast <8 x i64> %load to <16 x float>
-  %2 = fcmp oeq <16 x float> %0, %1
-  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %1 = bitcast <4 x i64> %__b to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <64 x i1> %3 to i64
   ret i64 %4
 }
 
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    vcmpeqps (%rdi), %ymm0, %k0
 ; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1884:
+; NoVLX-NEXT:  .Lcfi1865:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1885:
+; NoVLX-NEXT:  .Lcfi1866:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1886:
+; NoVLX-NEXT:  .Lcfi1867:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1887:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1888:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1889:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1890:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1891:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
-; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %r9d
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %edx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %esi
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %edi
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
-; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1868:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1869:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1870:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
-; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -53595,271 +53904,3881 @@ define zeroext i64 @test_vcmpoeqps_v16i1
 ; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
 ; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %0 = bitcast <4 x i64> %__a to <8 x float>
   %load = load float, float* %__b
-  %vec = insertelement <16 x float> undef, float %load, i32 0
-  %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <16 x float> %0, %1
-  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %vec = insertelement <8 x float> undef, float %load, i32 0
+  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <64 x i1> %3 to i64
   ret i64 %4
 }
 
-
-define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    movzwl %ax, %eax
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <8 x i64> %__a to <16 x float>
-  %1 = bitcast <8 x i64> %__b to <16 x float>
-  %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
-  %3 = zext i16 %2 to i64
-  ret i64 %3
-}
-
-
-declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
-define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
-; VLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1871:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1872:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1873:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %1 = bitcast <4 x i64> %__b to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1874:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1875:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1876:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovaps (%rsi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <8 x float>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1877:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1878:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1879:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <8 x float> undef, float %load, i32 0
+  %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x float> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1880:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1881:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1882:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi1883:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1884:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1885:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1886:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1887:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1888:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1889:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1890:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi1891:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1892:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1893:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1894:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1895:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1896:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1897:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1898:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi1899:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1900:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1901:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1902:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1903:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <16 x float> undef, float %load, i32 0
+  %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1904:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1905:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1906:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi1907:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1908:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1909:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1910:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1911:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1912:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1913:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1914:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi1915:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1916:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1917:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1918:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1919:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1920:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1921:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1922:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi1923:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1924:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1925:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1926:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1927:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <16 x float> undef, float %load, i32 0
+  %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovw %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
+  %3 = zext i16 %2 to i32
+  ret i32 %3
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovw %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)
+  %3 = zext i16 %2 to i32
+  ret i32 %3
+}
+
+
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1928:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1929:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1930:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi1931:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1932:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1933:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1934:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1935:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1936:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1937:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1938:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi1939:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1940:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1941:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1942:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1943:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1944:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1945:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1946:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi1947:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1948:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1949:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1950:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1951:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <16 x float> undef, float %load, i32 0
+  %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1952:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1953:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1954:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi1955:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1956:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1957:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1958:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1959:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1960:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1961:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1962:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi1963:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1964:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1965:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1966:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1967:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <16 x float>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1968:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1969:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1970:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi1971:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi1972:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi1973:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi1974:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi1975:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %load = load float, float* %__b
+  %vec = insertelement <16 x float> undef, float %load, i32 0
+  %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <16 x float> %0, %1
+  %3 = bitcast i16 %__u to <16 x i1>
+  %4 = and <16 x i1> %2, %3
+  %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+
+define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    movzwl %ax, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
+  %3 = zext i16 %2 to i64
+  ret i64 %3
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    movzwl %ax, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <16 x float>
+  %1 = bitcast <8 x i64> %__b to <16 x float>
+  %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)
+  %3 = zext i16 %2 to i64
+  ret i64 %3
+}
+
+
+
+declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
+define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i1> %3 to i4
+  ret i4 %4
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = bitcast <4 x i1> %5 to i4
+  ret i4 %6
+}
+
+
+
+define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+
+
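The *_mask_mem_b variants splat the scalar operand before comparing; a rough C-level sketch of the same shape (again a hypothetical helper, assuming <immintrin.h> and -mavx512vl):

  #include <immintrin.h>

  // Compare against a broadcast scalar. In IR this is the
  // insertelement + shufflevector splat seen in the tests; the VLX
  // checks expect it to fold into the compare as
  // "vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}".
  __mmask8 cmp_masked_bcst(__mmask8 u, __m128d a, const double *b) {
    __m128d splat = _mm_set1_pd(*b);
    return _mm_mask_cmp_pd_mask(u, a, splat, _CMP_EQ_OQ);
  }
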
+define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1976:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1977:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1978:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1979:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1980:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1981:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1982:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1983:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1984:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1985:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1986:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1987:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1988:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1989:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1990:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1991:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1992:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1993:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+
+
+define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
 ; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
-; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1994:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1995:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1996:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi1997:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1998:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi1999:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2000:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2001:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2002:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2003:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2004:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2005:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %1 = bitcast <2 x i64> %__b to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2006:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2007:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2008:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load <2 x i64>, <2 x i64>* %__b
+  %1 = bitcast <2 x i64> %load to <2 x double>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2009:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2010:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2011:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; NoVLX-NEXT:    vcmpeqpd %xmm3, %xmm0, %xmm0
+; NoVLX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <2 x double> undef, double %load, i32 0
+  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  %2 = fcmp oeq <2 x double> %0, %1
+  %3 = bitcast i2 %__u to <2 x i1>
+  %4 = and <2 x i1> %2, %3
+  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
+}
+
+
+
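The tests from here on repeat the same structure for 256-bit vectors (<4 x double>, a v4i1 source mask); the rough C-level sketch is simply the 256-bit form of the helper above (hypothetical name, same <immintrin.h>/-mavx512vl assumptions):

  #include <immintrin.h>

  // 256-bit variant: the VLX checks expect "vcmpeqpd %ymm1, %ymm0, %k0"
  // (with {%k1} added in the masked cases).
  __mmask8 cmp256_masked(__mmask8 u, __m256d a, __m256d b) {
    return _mm256_mask_cmp_pd_mask(u, a, b, _CMP_EQ_OQ);
  }
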
+define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
+}
+
+define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %1 = bitcast <2 x i64> %__b to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = bitcast <4 x i1> %3 to i4
-  ret i4 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i1> %3 to i8
+  ret i8 %4
 }
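The _b variants splat a single double through insertelement plus a splat shufflevector. On VLX that folds into the embedded-broadcast memory operand seen above, (%rdi){1to4}; without VLX it stays an explicit vbroadcastsd. A reduced sketch of the splat shape (hypothetical function mirroring the tests, not from the committed file):

    define zeroext i8 @bcast_shape(<4 x double> %a, double* %p) {
      %s  = load double, double* %p
      ; insertelement + all-zero shuffle mask is the splat idiom isel recognizes
      %v0 = insertelement <4 x double> undef, double %s, i32 0
      %v  = shufflevector <4 x double> %v0, <4 x double> undef, <4 x i32> zeroinitializer
      %c  = fcmp oeq <4 x double> %a, %v
      %m  = bitcast <4 x i1> %c to i4
      %z  = zext i4 %m to i8
      ret i8 %z
    }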
 
-define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
-; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
+}
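The masked variants are the cases this patch targets: the i4 %__u argument is bitcast to <4 x i1> and ANDed with the compare result before the widening shuffle. That and(mask, cmp) shape is what the newly added patterns fold into a write-masked compare, so VLX emits the single instruction vcmpeqpd %ymm1, %ymm0, %k0 {%k1} instead of materializing the mask in vector registers as the NoVLX path must. Distilled (a hypothetical reduction of the test above):

    define zeroext i8 @masked_shape(i4 zeroext %u, <4 x double> %a, <4 x double> %b) {
      %c  = fcmp oeq <4 x double> %a, %b
      ; the and with the bitcast mask is what becomes the {%k1} write mask
      %m  = bitcast i4 %u to <4 x i1>
      %k  = and <4 x i1> %c, %m
      %b4 = bitcast <4 x i1> %k to i4
      %z  = zext i4 %b4 to i8
      ret i8 %z
    }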
+
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %load = load <2 x i64>, <2 x i64>* %__b
-  %1 = bitcast <2 x i64> %load to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = bitcast <4 x i1> %3 to i4
-  ret i4 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
 }
 
-define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
   %load = load double, double* %__b
-  %vec = insertelement <2 x double> undef, double %load, i32 0
-  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = bitcast <4 x i1> %3 to i4
-  ret i4 %4
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <8 x i1> %5 to i8
+  ret i8 %6
 }
 
 
-define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
 ; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %1 = bitcast <2 x i64> %__b to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <8 x i1> %3 to i8
-  ret i8 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
 }
 
-define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
 ; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %load = load <2 x i64>, <2 x i64>* %__b
-  %1 = bitcast <2 x i64> %load to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <8 x i1> %3 to i8
-  ret i8 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
 }
 
-define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
 ; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
   %load = load double, double* %__b
-  %vec = insertelement <2 x double> undef, double %load, i32 0
-  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <8 x i1> %3 to i8
-  ret i8 %4
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
 }
 
-
-define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
@@ -53869,35 +57788,63 @@ define zeroext i16 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
 ; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %1 = bitcast <2 x i64> %__b to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
 }
 
-define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
@@ -53907,37 +57854,65 @@ define zeroext i16 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
 ; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %load = load <2 x i64>, <2 x i64>* %__b
-  %1 = bitcast <2 x i64> %load to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
 }
 
-define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
@@ -53947,47 +57922,68 @@ define zeroext i16 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
 ; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
 ; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
   %load = load double, double* %__b
-  %vec = insertelement <2 x double> undef, double %load, i32 0
-  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
 }
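For the v16i1 results the per-bit reassembly in the NoVLX expansion switches from the qword forms used at v8i1 (vpternlogq, vpermi2q, vpsllq, vptestmq) to their dword counterparts; the vpextrb/kmovw insertion loop is otherwise the same. Semantically every width still reduces to a zero-extension of the masked 4-bit result, e.g. (sketch, not from the committed file):

    define zeroext i16 @masked_widen16(i4 zeroext %u, <4 x double> %a, <4 x double> %b) {
      %c  = fcmp oeq <4 x double> %a, %b
      %m  = bitcast i4 %u to <4 x i1>
      %k  = and <4 x i1> %c, %m
      ; widening to i16 only changes the zext destination type
      %b4 = bitcast <4 x i1> %k to i4
      %z  = zext i4 %b4 to i16
      ret i16 %z
    }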
 
 
-define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1892:
+; NoVLX-NEXT:  .Lcfi2012:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1893:
+; NoVLX-NEXT:  .Lcfi2013:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1894:
+; NoVLX-NEXT:  .Lcfi2014:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -54002,35 +57998,194 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %1 = bitcast <2 x i64> %__b to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   %4 = bitcast <32 x i1> %3 to i32
   ret i32 %4
 }
 
-define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1895:
+; NoVLX-NEXT:  .Lcfi2015:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1896:
+; NoVLX-NEXT:  .Lcfi2016:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1897:
+; NoVLX-NEXT:  .Lcfi2017:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2018:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2019:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2020:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2021:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2022:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2023:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2024:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2025:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2026:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -54039,43 +58194,55 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %load = load <2 x i64>, <2 x i64>* %__b
-  %1 = bitcast <2 x i64> %load to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1898:
+; NoVLX-NEXT:  .Lcfi2027:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1899:
+; NoVLX-NEXT:  .Lcfi2028:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1900:
+; NoVLX-NEXT:  .Lcfi2029:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT:    vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -54084,48 +58251,53 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
   %load = load double, double* %__b
-  %vec = insertelement <2 x double> undef, double %load, i32 0
-  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
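
Each of the new "test_masked_*" functions above follows the same IR recipe: the scalar mask argument is bitcast to a vector of i1, ANDed with the fcmp result, widened with a shufflevector, and bitcast back to an integer. A minimal sketch of that shape for a 4-lane compare packed into an i8 (function and value names here are illustrative, not taken from the test file):

define zeroext i8 @masked_cmp_sketch(i4 zeroext %u, <4 x double> %a, <4 x double> %b) {
entry:
  %cmp  = fcmp oeq <4 x double> %a, %b    ; per-lane compare
  %mask = bitcast i4 %u to <4 x i1>       ; scalar mask -> per-lane i1
  %and  = and <4 x i1> %cmp, %mask        ; apply the write-mask
  %wide = shufflevector <4 x i1> %and, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %res  = bitcast <8 x i1> %wide to i8    ; pack to an integer mask
  ret i8 %res
}

As the VLX CHECK lines above show, the and+fcmp pair folds into a single masked compare, e.g. vcmpeqpd %ymm1, %ymm0, %k0 {%k1}, while the NoVLX path materializes the mask bits through vpternlogq/vpmovqd and applies them with vpand.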
 
 
-define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
 ; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1901:
+; NoVLX-NEXT:  .Lcfi2030:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1902:
+; NoVLX-NEXT:  .Lcfi2031:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1903:
+; NoVLX-NEXT:  .Lcfi2032:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -54143,39 +58315,41 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %1 = bitcast <2 x i64> %__b to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %1 = bitcast <4 x i64> %__b to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   %4 = bitcast <64 x i1> %3 to i64
   ret i64 %4
 }
 
-define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
 ; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1904:
+; NoVLX-NEXT:  .Lcfi2033:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1905:
+; NoVLX-NEXT:  .Lcfi2034:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1906:
+; NoVLX-NEXT:  .Lcfi2035:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -54193,41 +58367,43 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
-  %load = load <2 x i64>, <2 x i64>* %__b
-  %1 = bitcast <2 x i64> %load to <2 x double>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %load = load <4 x i64>, <4 x i64>* %__b
+  %1 = bitcast <4 x i64> %load to <4 x double>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   %4 = bitcast <64 x i1> %3 to i64
   ret i64 %4
 }
 
-define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
 ; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1907:
+; NoVLX-NEXT:  .Lcfi2036:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1908:
+; NoVLX-NEXT:  .Lcfi2037:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1909:
+; NoVLX-NEXT:  .Lcfi2038:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
-; NoVLX-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -54245,180 +58421,192 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <2 x i64> %__a to <2 x double>
+  %0 = bitcast <4 x i64> %__a to <4 x double>
   %load = load double, double* %__b
-  %vec = insertelement <2 x double> undef, double %load, i32 0
-  %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-  %2 = fcmp oeq <2 x double> %0, %1
-  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %vec = insertelement <4 x double> undef, double %load, i32 0
+  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <4 x double> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   %4 = bitcast <64 x i1> %3 to i64
   ret i64 %4
 }
 
-
-define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2039:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2040:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2041:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    vpmovqd %zmm2, %ymm2
+; NoVLX-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x double>
   %1 = bitcast <4 x i64> %__b to <4 x double>
   %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <8 x i1> %3 to i8
-  ret i8 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
-define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2042:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2043:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2044:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x double>
   %load = load <4 x i64>, <4 x i64>* %__b
   %1 = bitcast <4 x i64> %load to <4 x double>
   %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <8 x i1> %3 to i8
-  ret i8 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
-define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2045:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2046:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2047:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    vpmovqd %zmm1, %ymm1
+; NoVLX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; NoVLX-NEXT:    vbroadcastsd (%rsi), %ymm3
+; NoVLX-NEXT:    vcmpeqpd %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54426,305 +58614,290 @@ entry:
   %vec = insertelement <4 x double> undef, double %load, i32 0
   %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <8 x i1> %3 to i8
-  ret i8 %4
+  %3 = bitcast i4 %__u to <4 x i1>
+  %4 = and <4 x i1> %2, %3
+  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
 
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %1 = bitcast <4 x i64> %__b to <4 x double>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <16 x i1> %3 to i16
   ret i16 %4
 }
 
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %load = load <4 x i64>, <4 x i64>* %__b
-  %1 = bitcast <4 x i64> %load to <4 x double>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <16 x i1> %3 to i16
   ret i16 %4
 }
 
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
-; NoVLX-NEXT:    andl $1, %eax
-; NoVLX-NEXT:    kmovw %eax, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
-; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
-; NoVLX-NEXT:    korw %k0, %k1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT:    kmovw %eax, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <16 x i1> %3 to i16
+  ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
   %load = load double, double* %__b
-  %vec = insertelement <4 x double> undef, double %load, i32 0
-  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <16 x i1> %5 to i16
+  ret i16 %6
 }
 
 
-define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+
+define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    movzbl %al, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1910:
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1911:
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1912:
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzbl %al, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %1 = bitcast <4 x i64> %__b to <4 x double>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
+  %3 = zext i8 %2 to i16
+  ret i16 %3
 }
 
-define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    movzbl %al, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1913:
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1914:
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1915:
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzbl %al, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %load = load <4 x i64>, <4 x i64>* %__b
-  %1 = bitcast <4 x i64> %load to <4 x double>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+  %3 = zext i8 %2 to i16
+  ret i16 %3
 }
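
The two {sae} tests above go through the target intrinsic rather than fcmp; the i32 2 operand selects the LE predicate (hence vcmplepd in the CHECK lines) and the trailing i32 8 requests suppress-all-exceptions. A minimal call-shape sketch, assuming only the declaration implied by the calls in the tests:

declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)

define zeroext i8 @sae_cmp_sketch(<8 x double> %a, <8 x double> %b, i8 zeroext %u) {
entry:
  ; predicate 2 = LE, last operand 8 = {sae}; %u supplies the write-mask
  %r = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 2, i8 %u, i32 8)
  ret i8 %r
}

Passing i8 -1 instead of %u, as the unmasked sae test does, compares all eight lanes unconditionally.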
 
-define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
 ; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1916:
+; NoVLX-NEXT:  .Lcfi2048:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1917:
+; NoVLX-NEXT:  .Lcfi2049:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1918:
+; NoVLX-NEXT:  .Lcfi2050:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -54734,241 +58907,388 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %load = load double, double* %__b
-  %vec = insertelement <4 x double> undef, double %load, i32 0
-  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <32 x i1> %3 to i32
   ret i32 %4
 }
 
-
-define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1919:
+; NoVLX-NEXT:  .Lcfi2051:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1920:
+; NoVLX-NEXT:  .Lcfi2052:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1921:
+; NoVLX-NEXT:  .Lcfi2053:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %1 = bitcast <4 x i64> %__b to <4 x double>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load <8 x i64>, <8 x i64>* %__b
+  %1 = bitcast <8 x i64> %load to <8 x double>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
 }
 
-define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1922:
+; NoVLX-NEXT:  .Lcfi2054:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1923:
+; NoVLX-NEXT:  .Lcfi2055:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1924:
+; NoVLX-NEXT:  .Lcfi2056:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %load = load <4 x i64>, <4 x i64>* %__b
-  %1 = bitcast <4 x i64> %load to <4 x double>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %load = load double, double* %__b
+  %vec = insertelement <8 x double> undef, double %load, i32 0
+  %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %2 = fcmp oeq <8 x double> %0, %1
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <32 x i1> %3 to i32
+  ret i32 %4
 }
 
-define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to4}, %ymm0, %k0
-; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1925:
+; NoVLX-NEXT:  .Lcfi2057:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1926:
+; NoVLX-NEXT:  .Lcfi2058:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1927:
+; NoVLX-NEXT:  .Lcfi2059:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
-; NoVLX-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
-  %0 = bitcast <4 x i64> %__a to <4 x double>
-  %load = load double, double* %__b
-  %vec = insertelement <4 x double> undef, double %load, i32 0
-  %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-  %2 = fcmp oeq <4 x double> %0, %1
-  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
-}
-
-
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; VLX-NEXT:    vzeroupper
-; VLX-NEXT:    retq
-;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; NoVLX-NEXT:    retq
-entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2060:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2061:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2062:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %load = load <8 x i64>, <8 x i64>* %__b
   %1 = bitcast <8 x i64> %load to <8 x double>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi2063:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi2064:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2065:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
@@ -54976,62 +59296,89 @@ entry:
   %vec = insertelement <8 x double> undef, double %load, i32 0
   %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <16 x i1> %3 to i16
-  ret i16 %4
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <32 x i1> %5 to i32
+  ret i32 %6
 }
 
 
-define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+
+define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
 ; VLX:       # BB#0: # %entry
 ; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
-; VLX-NEXT:    movzbl %al, %eax
-; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    kmovb %k0, %eax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzbl %al, %eax
-; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
   %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
-  %3 = zext i8 %2 to i16
-  ret i16 %3
+  %3 = zext i8 %2 to i32
+  ret i32 %3
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzbl %al, %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+  %3 = zext i8 %2 to i32
+  ret i32 %3
 }
 
 
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
 ; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1928:
+; NoVLX-NEXT:  .Lcfi2066:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1929:
+; NoVLX-NEXT:  .Lcfi2067:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1930:
+; NoVLX-NEXT:  .Lcfi2068:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -55059,9 +59406,9 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
@@ -55069,7 +59416,10 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
@@ -55077,35 +59427,37 @@ entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
 }
 
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; VLX:       # BB#0: # %entry
 ; VLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1931:
+; NoVLX-NEXT:  .Lcfi2069:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1932:
+; NoVLX-NEXT:  .Lcfi2070:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1933:
+; NoVLX-NEXT:  .Lcfi2071:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -55133,9 +59485,9 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
@@ -55143,7 +59495,10 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
@@ -55152,35 +59507,37 @@ entry:
   %load = load <8 x i64>, <8 x i64>* %__b
   %1 = bitcast <8 x i64> %load to <8 x double>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
 }
 
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
 ; VLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1934:
+; NoVLX-NEXT:  .Lcfi2072:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1935:
+; NoVLX-NEXT:  .Lcfi2073:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1936:
+; NoVLX-NEXT:  .Lcfi2074:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -55208,9 +59565,9 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
@@ -55218,7 +59575,10 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
 ; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
 ; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
@@ -55228,56 +59588,34 @@ entry:
   %vec = insertelement <8 x double> undef, double %load, i32 0
   %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <32 x i1> %3 to i32
-  ret i32 %4
-}
-
-
-define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
-; VLX-NEXT:    kmovb %k0, %eax
-; VLX-NEXT:    vzeroupper
-; VLX-NEXT:    retq
-;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzbl %al, %eax
-; NoVLX-NEXT:    retq
-entry:
-  %0 = bitcast <8 x i64> %__a to <8 x double>
-  %1 = bitcast <8 x i64> %__b to <8 x double>
-  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 -1, i32 8)
-  %3 = zext i8 %2 to i32
-  ret i32 %3
+  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %4 = bitcast <64 x i1> %3 to i64
+  ret i64 %4
 }
 
-
-define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1937:
+; NoVLX-NEXT:  .Lcfi2075:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1938:
+; NoVLX-NEXT:  .Lcfi2076:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1939:
+; NoVLX-NEXT:  .Lcfi2077:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
@@ -55331,32 +59669,36 @@ entry:
   %0 = bitcast <8 x i64> %__a to <8 x double>
   %1 = bitcast <8 x i64> %__b to <8 x double>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
-define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1940:
+; NoVLX-NEXT:  .Lcfi2078:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1941:
+; NoVLX-NEXT:  .Lcfi2079:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1942:
+; NoVLX-NEXT:  .Lcfi2080:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
@@ -55411,32 +59753,36 @@ entry:
   %load = load <8 x i64>, <8 x i64>* %__b
   %1 = bitcast <8 x i64> %load to <8 x double>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
-define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; VLX:       # BB#0: # %entry
-; VLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
 ; VLX-NEXT:    kmovq %k0, %rax
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
 ;
-; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1943:
+; NoVLX-NEXT:  .Lcfi2081:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1944:
+; NoVLX-NEXT:  .Lcfi2082:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1945:
+; NoVLX-NEXT:  .Lcfi2083:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
@@ -55492,12 +59838,15 @@ entry:
   %vec = insertelement <8 x double> undef, double %load, i32 0
   %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %2 = fcmp oeq <8 x double> %0, %1
-  %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %4 = bitcast <64 x i1> %3 to i64
-  ret i64 %4
+  %3 = bitcast i8 %__u to <8 x i1>
+  %4 = and <8 x i1> %2, %3
+  %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %6 = bitcast <64 x i1> %5 to i64
+  ret i64 %6
 }
 
 
+
 define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
 ; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
 ; VLX:       # BB#0: # %entry
@@ -55520,6 +59869,31 @@ entry:
   %3 = zext i8 %2 to i64
   ret i64 %3
 }
+
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    movzbl %al, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzbl %al, %eax
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__a to <8 x double>
+  %1 = bitcast <8 x i64> %__b to <8 x double>
+  %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+  %3 = zext i8 %2 to i64
+  ret i64 %3
+}
 
 
 
