[llvm] r324184 - [X86] Add DAG combine to turn (bitcast (and/or/xor (bitcast X), Y)) -> (and/or/xor X, (bitcast Y)) when casting between GPRs and mask registers.

Craig Topper via llvm-commits llvm-commits@lists.llvm.org
Sat Feb 3 17:43:48 PST 2018


Author: ctopper
Date: Sat Feb  3 17:43:48 2018
New Revision: 324184

URL: http://llvm.org/viewvc/llvm-project?rev=324184&view=rev
Log:
[X86] Add DAG combine to turn (bitcast (and/or/xor (bitcast X), Y)) -> (and/or/xor X, (bitcast Y)) when casting between GPRs and mask registers.

This reduces the number of transitions between k-registers and GPRs, which in turn reduces the instruction count.

There is still room to remove more transitions, but this is a good start.
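
As a concrete illustration, take the mask16 test from avx512-mask-op.ll
(updated below): it bitcasts an i16 to <16 x i1>, inverts the mask, and
bitcasts back. Previously that pattern round-tripped through a k-register;
with the combine, the inversion happens directly on the GPR (KNL output
shown, kill annotation omitted):

    define i16 @mask16(i16 %x) {
      %m0 = bitcast i16 %x to <16 x i1>
      %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
                                i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
                                i1 -1, i1 -1, i1 -1, i1 -1>
      %ret = bitcast <16 x i1> %m1 to i16
      ret i16 %ret
    }

    ; before:                 ; after:
    ;   kmovw %edi, %k0       ;   notl %edi
    ;   knotw %k0, %k0        ;   movl %edi, %eax
    ;   kmovw %k0, %eax       ;   retq
    ;   retq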

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-vec-test-testn.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
    llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512f-vec-test-testn.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-vec-test-testn.ll
    llvm/trunk/test/CodeGen/X86/combine-testm-and.ll
    llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Feb  3 17:43:48 2018
@@ -30466,6 +30466,54 @@ static SDValue combineBitcastvxi1(Select
   return DAG.getZExtOrTrunc(V, DL, VT);
 }
 
+static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget &Subtarget) {
+  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
+
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Only do this if we have k-registers.
+  if (!Subtarget.hasAVX512())
+    return SDValue();
+
+  EVT DstVT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT SrcVT = Op.getValueType();
+
+  if (!Op.hasOneUse())
+    return SDValue();
+
+  // Look for logic ops.
+  if (Op.getOpcode() != ISD::AND &&
+      Op.getOpcode() != ISD::OR &&
+      Op.getOpcode() != ISD::XOR)
+    return SDValue();
+
+  // Make sure we have a bitcast between mask registers and a scalar type.
+  if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+        DstVT.isScalarInteger()) &&
+      !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
+        SrcVT.isScalarInteger()))
+    return SDValue();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
+      LHS.getOperand(0).getValueType() == DstVT)
+    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
+                       DAG.getBitcast(DstVT, RHS));
+
+  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
+      RHS.getOperand(0).getValueType() == DstVT)
+    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
+                       DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
+
+  return SDValue();
+}
+
 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
@@ -30551,6 +30599,11 @@ static SDValue combineBitcast(SDNode *N,
     }
   }
 
+  // Try to remove bitcasts from input and output of mask arithmetic to
+  // remove GPR<->K-register crossings.
+  if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
+    return V;
+
   // Convert a bitcasted integer logic operation that has one bitcasted
   // floating-point operand into a floating-point logic operation. This may
   // create a load of a constant, but that is cheaper than materializing the
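
In the notation of the subject line, the rewrite for the mask16 pattern is
(taking the case where the inner bitcast is the left operand):

    (i16 (bitcast (xor (v16i1 (bitcast i16 %x)), (v16i1 all-ones))))
      -> (xor i16 %x, (i16 (bitcast (v16i1 all-ones))))
      -> (xor i16 %x, -1)

The bitcast of the all-ones mask constant-folds to -1, which is what yields
the plain notl seen in the tests below. The hasOneUse checks ensure the
intermediate mask value is not needed elsewhere, so the rewrite never
duplicates work, and the second pattern in the function handles the
symmetric case where the inner bitcast is the right operand.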

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll Sat Feb  3 17:43:48 2018
@@ -568,9 +568,9 @@ define i16 @test_pcmpeq_d(<16 x i32> %a,
 define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
@@ -593,9 +593,9 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <
 define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
@@ -618,9 +618,9 @@ define i16 @test_pcmpgt_d(<16 x i32> %a,
 define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
@@ -643,9 +643,9 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <
 define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
@@ -3687,11 +3687,9 @@ define i8 @test_vptestmq(<8 x i64> %a0,
 ; CHECK-LABEL: test_vptestmq:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    andb %al, %dil
+; CHECK-NEXT:    addb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
@@ -3705,11 +3703,9 @@ define i16 @test_vptestmd(<16 x i32> %a0
 ; CHECK-LABEL: test_vptestmd:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    andl %eax, %edi
+; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
@@ -3725,11 +3721,9 @@ define i16 @test_int_x86_avx512_ptestnm_d
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k1, %ecx
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    andl %eax, %edi
+; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
@@ -3744,11 +3738,9 @@ define i8 @test_int_x86_avx512_ptestnm_q_
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k1, %ecx
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    andb %al, %dil
+; CHECK-NEXT:    addb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
@@ -3762,12 +3754,8 @@ define i16 @test_kand(i16 %a0, i16 %a1)
 ; CHECK-LABEL: test_kand:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    andl $8, %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
@@ -3782,9 +3770,9 @@ define i16 @test_kandn(i16 %a0, i16 %a1)
 ; CHECK-NEXT:    movw $8, %ax
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    kandnw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    kandnw %k1, %k0, %k0
+; CHECK-NEXT:    knotw %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl %esi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
@@ -3796,10 +3784,8 @@ declare i16 @llvm.x86.avx512.knot.w(i16)
 define i16 @test_knot(i16 %a0) {
 ; CHECK-LABEL: test_knot:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    knotw %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    notl %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
   ret i16 %res
@@ -3810,12 +3796,8 @@ define i16 @test_kor(i16 %a0, i16 %a1) {
 ; CHECK-LABEL: test_kor:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    korw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    orl $8, %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1)
@@ -3829,12 +3811,8 @@ define i16 @test_kxnor(i16 %a0, i16 %a1)
 ; CHECK-LABEL: test_kxnor:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    kxorw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    xorl $8, %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
@@ -3846,12 +3824,8 @@ define i16 @test_kxor(i16 %a0, i16 %a1)
 ; CHECK-LABEL: test_kxor:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    kxorw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    xorl $8, %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Sat Feb  3 17:43:48 2018
@@ -6,37 +6,11 @@
 
 
 define i16 @mask16(i16 %x) {
-; KNL-LABEL: mask16:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    knotw %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: mask16:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0
-; SKX-NEXT:    knotw %k0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
-; SKX-NEXT:    retq
-;
-; AVX512BW-LABEL: mask16:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    ## kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: mask16:
-; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %edi, %k0
-; AVX512DQ-NEXT:    knotw %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    ## kill: def $ax killed $ax killed $eax
-; AVX512DQ-NEXT:    retq
+; CHECK-LABEL: mask16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    notl %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <16 x i1> %m1 to i16
@@ -44,33 +18,11 @@ define i16 @mask16(i16 %x) {
 }
 
 define i32 @mask16_zext(i16 %x) {
-; KNL-LABEL: mask16_zext:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    knotw %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: mask16_zext:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0
-; SKX-NEXT:    knotw %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
-; SKX-NEXT:    retq
-;
-; AVX512BW-LABEL: mask16_zext:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
-; AVX512BW-NEXT:    kmovw %k0, %eax
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: mask16_zext:
-; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %edi, %k0
-; AVX512DQ-NEXT:    knotw %k0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    retq
+; CHECK-LABEL: mask16_zext:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    notl %edi
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %m2 = bitcast <16 x i1> %m1 to i16
@@ -81,33 +33,33 @@ define i32 @mask16_zext(i16 %x) {
 define i8 @mask8(i8 %x) {
 ; KNL-LABEL: mask8:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    kxnorw %k0, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    xorb %dil, %al
 ; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: mask8:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0
-; SKX-NEXT:    knotb %k0, %k0
+; SKX-NEXT:    kxnorw %k0, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    xorb %dil, %al
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask8:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    xorb %dil, %al
 ; AVX512BW-NEXT:    ## kill: def $al killed $al killed $eax
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask8:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %edi, %k0
-; AVX512DQ-NEXT:    knotb %k0, %k0
+; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    xorb %dil, %al
 ; AVX512DQ-NEXT:    ## kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
@@ -119,32 +71,34 @@ define i8 @mask8(i8 %x) {
 define i32 @mask8_zext(i8 %x) {
 ; KNL-LABEL: mask8_zext:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    kxnorw %k0, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    xorb %dil, %al
 ; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: mask8_zext:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0
-; SKX-NEXT:    knotb %k0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kxnorw %k0, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    xorb %dil, %al
+; SKX-NEXT:    movzbl %al, %eax
 ; SKX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mask8_zext:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    xorb %dil, %al
 ; AVX512BW-NEXT:    movzbl %al, %eax
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask8_zext:
 ; AVX512DQ:       ## %bb.0:
-; AVX512DQ-NEXT:    kmovw %edi, %k0
-; AVX512DQ-NEXT:    knotb %k0, %k0
-; AVX512DQ-NEXT:    kmovb %k0, %eax
+; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    xorb %dil, %al
+; AVX512DQ-NEXT:    movzbl %al, %eax
 ; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -171,11 +125,9 @@ define void @mask16_mem(i16* %ptr) {
 define void @mask8_mem(i8* %ptr) {
 ; KNL-LABEL: mask8_mem:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movzbl (%rdi), %eax
-; KNL-NEXT:    kmovw %eax, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    kxnorw %k0, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    xorb %al, (%rdi)
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: mask8_mem:
@@ -187,11 +139,9 @@ define void @mask8_mem(i8* %ptr) {
 ;
 ; AVX512BW-LABEL: mask8_mem:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    movzbl (%rdi), %eax
-; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    xorb %al, (%rdi)
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: mask8_mem:

Modified: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Sat Feb  3 17:43:48 2018
@@ -6663,18 +6663,14 @@ define <8 x double> @mov_test47(i8 * %ad
 define i16 @mask16(i16 %x) {
 ; GENERIC-LABEL: mask16:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovd %edi, %k0 # sched: [1:0.33]
-; GENERIC-NEXT:    knotw %k0, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
+; GENERIC-NEXT:    notl %edi # sched: [1:0.33]
+; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mask16:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0 # sched: [1:1.00]
-; SKX-NEXT:    knotw %k0, %k0 # sched: [1:1.00]
-; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
-; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
+; SKX-NEXT:    notl %edi # sched: [1:0.25]
+; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -6685,16 +6681,15 @@ define i16 @mask16(i16 %x) {
 define i32 @mask16_zext(i16 %x) {
 ; GENERIC-LABEL: mask16_zext:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovd %edi, %k0 # sched: [1:0.33]
-; GENERIC-NEXT:    knotw %k0, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    kmovw %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    notl %edi # sched: [1:0.33]
+; GENERIC-NEXT:    movzwl %di, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mask16_zext:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0 # sched: [1:1.00]
-; SKX-NEXT:    knotw %k0, %k0 # sched: [1:1.00]
-; SKX-NEXT:    kmovw %k0, %eax # sched: [3:1.00]
+; SKX-NEXT:    movl $65535, %eax # imm = 0xFFFF
+; SKX-NEXT:    # sched: [1:0.25]
+; SKX-NEXT:    andnl %eax, %edi, %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -6706,17 +6701,17 @@ define i32 @mask16_zext(i16 %x) {
 define i8 @mask8(i8 %x) {
 ; GENERIC-LABEL: mask8:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovd %edi, %k0 # sched: [1:0.33]
-; GENERIC-NEXT:    knotb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kxnorw %k0, %k0, %k0 # sched: [1:1.00]
 ; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    xorb %dil, %al # sched: [1:0.33]
 ; GENERIC-NEXT:    # kill: def $al killed $al killed $eax
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mask8:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0 # sched: [1:1.00]
-; SKX-NEXT:    knotb %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kxnorw %k0, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT:    xorb %dil, %al # sched: [1:0.25]
 ; SKX-NEXT:    # kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %m0 = bitcast i8 %x to <8 x i1>
@@ -6728,16 +6723,18 @@ define i8 @mask8(i8 %x) {
 define i32 @mask8_zext(i8 %x) {
 ; GENERIC-LABEL: mask8_zext:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovd %edi, %k0 # sched: [1:0.33]
-; GENERIC-NEXT:    knotb %k0, %k0 # sched: [1:1.00]
-; GENERIC-NEXT:    kmovb %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    kxnorw %k0, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    xorb %dil, %al # sched: [1:0.33]
+; GENERIC-NEXT:    movzbl %al, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mask8_zext:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    kmovd %edi, %k0 # sched: [1:1.00]
-; SKX-NEXT:    knotb %k0, %k0 # sched: [1:1.00]
-; SKX-NEXT:    kmovb %k0, %eax # sched: [3:1.00]
+; SKX-NEXT:    kxnorw %k0, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT:    xorb %dil, %al # sched: [1:0.25]
+; SKX-NEXT:    movzbl %al, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll Sat Feb  3 17:43:48 2018
@@ -624,14 +624,12 @@ entry:
 define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X32-LABEL: test_mm512_mask_test_epi8_mask:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestmb %zmm0, %zmm1, %k2
-; X32-NEXT:    kandd %k1, %k2, %k1
-; X32-NEXT:    kmovd %k1, %eax
-; X32-NEXT:    kshiftrq $32, %k2, %k1
-; X32-NEXT:    kandd %k0, %k1, %k0
-; X32-NEXT:    kmovd %k0, %edx
+; X32-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; X32-NEXT:    kshiftrq $32, %k0, %k1
+; X32-NEXT:    kmovd %k1, %edx
+; X32-NEXT:    kmovd %k0, %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
@@ -727,14 +725,12 @@ entry:
 define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
 ; X32-LABEL: test_mm512_mask_testn_epi8_mask:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestnmb %zmm0, %zmm1, %k2
-; X32-NEXT:    kandd %k1, %k2, %k1
-; X32-NEXT:    kmovd %k1, %eax
-; X32-NEXT:    kshiftrq $32, %k2, %k1
-; X32-NEXT:    kandd %k0, %k1, %k0
-; X32-NEXT:    kmovd %k0, %edx
+; X32-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; X32-NEXT:    kshiftrq $32, %k0, %k1
+; X32-NEXT:    kmovd %k1, %edx
+; X32-NEXT:    kmovd %k0, %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Sat Feb  3 17:43:48 2018
@@ -402,19 +402,20 @@ define i64 @test_pcmpeq_b(<64 x i8> %a,
 define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
 ; AVX512BW-LABEL: test_mask_pcmpeq_b:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    andq %rdi, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpeq_b:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    kmovd %k1, %edx
+; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -444,17 +445,17 @@ define i32 @test_pcmpeq_w(<32 x i16> %a,
 define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
 ; AVX512BW-LABEL: test_mask_pcmpeq_w:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpeq_w:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
@@ -486,19 +487,20 @@ define i64 @test_pcmpgt_b(<64 x i8> %a,
 define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
 ; AVX512BW-LABEL: test_mask_pcmpgt_b:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    andq %rdi, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpgt_b:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    kmovd %k1, %edx
+; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -528,17 +530,17 @@ define i32 @test_pcmpgt_w(<32 x i16> %a,
 define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
 ; AVX512BW-LABEL: test_mask_pcmpgt_w:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpgt_w:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
@@ -2480,11 +2482,9 @@ define i64 @test_int_x86_avx512_ptestm_b_
 ; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vptestmb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vptestmb %zmm1, %zmm0, %k1 {%k1}
-; AVX512BW-NEXT:    kmovq %k1, %rcx
 ; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    andq %rax, %rdi
+; AVX512BW-NEXT:    addq %rdi, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -2494,14 +2494,13 @@ define i64 @test_int_x86_avx512_ptestm_b_
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -8
 ; AVX512F-32-NEXT:    vptestmb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vptestmb %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT:    kshiftrq $32, %k1, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ecx
-; AVX512F-32-NEXT:    kmovd %k1, %esi
 ; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    kmovd %k1, %ecx
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    andl %ecx, %edx
+; AVX512F-32-NEXT:    kmovd %k0, %esi
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    andl %esi, %eax
 ; AVX512F-32-NEXT:    addl %esi, %eax
 ; AVX512F-32-NEXT:    adcl %ecx, %edx
 ; AVX512F-32-NEXT:    popl %esi
@@ -2519,21 +2518,18 @@ define i32 @test_int_x86_avx512_ptestm_w_
 ; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vptestmw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vptestmw %zmm1, %zmm0, %k1 {%k1}
-; AVX512BW-NEXT:    kmovd %k1, %ecx
 ; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    andl %eax, %edi
+; AVX512BW-NEXT:    addl %edi, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
 ; AVX512F-32:       # %bb.0:
 ; AVX512F-32-NEXT:    vptestmw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vptestmw %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT:    kmovd %k1, %ecx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    kmovd %k0, %ecx
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    andl %ecx, %eax
 ; AVX512F-32-NEXT:    addl %ecx, %eax
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
@@ -2549,11 +2545,9 @@ define i64 @test_int_x86_avx512_ptestnm_b
 ; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k1 {%k1}
-; AVX512BW-NEXT:    kmovq %k1, %rcx
 ; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    andq %rax, %rdi
+; AVX512BW-NEXT:    addq %rdi, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -2563,14 +2557,13 @@ define i64 @test_int_x86_avx512_ptestnm_b
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512F-32-NEXT:    .cfi_offset %esi, -8
 ; AVX512F-32-NEXT:    vptestnmb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vptestnmb %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT:    kshiftrq $32, %k1, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ecx
-; AVX512F-32-NEXT:    kmovd %k1, %esi
 ; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    kmovd %k1, %ecx
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    andl %ecx, %edx
+; AVX512F-32-NEXT:    kmovd %k0, %esi
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    andl %esi, %eax
 ; AVX512F-32-NEXT:    addl %esi, %eax
 ; AVX512F-32-NEXT:    adcl %ecx, %edx
 ; AVX512F-32-NEXT:    popl %esi
@@ -2588,21 +2581,18 @@ define i32 @test_int_x86_avx512_ptestnm_w
 ; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vptestnmw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vptestnmw %zmm1, %zmm0, %k1 {%k1}
-; AVX512BW-NEXT:    kmovd %k1, %ecx
 ; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    andl %eax, %edi
+; AVX512BW-NEXT:    addl %edi, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
 ; AVX512F-32:       # %bb.0:
 ; AVX512F-32-NEXT:    vptestnmw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vptestnmw %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT:    kmovd %k1, %ecx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    kmovd %k0, %ecx
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    andl %ecx, %eax
 ; AVX512F-32-NEXT:    addl %ecx, %eax
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-mask-op.ll Sat Feb  3 17:43:48 2018
@@ -4,9 +4,9 @@
 define i32 @mask32(i32 %x) {
 ; CHECK-LABEL: mask32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k0
-; CHECK-NEXT:    knotd %k0, %k0
+; CHECK-NEXT:    kxnord %k0, %k0, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    xorl %edi, %eax
 ; CHECK-NEXT:    retq
   %m0 = bitcast i32 %x to <32 x i1>
   %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -20,9 +20,9 @@ define i32 @mask32(i32 %x) {
 define i64 @mask64(i64 %x) {
 ; CHECK-LABEL: mask64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovq %rdi, %k0
-; CHECK-NEXT:    knotq %k0, %k0
+; CHECK-NEXT:    kxnorq %k0, %k0, %k0
 ; CHECK-NEXT:    kmovq %k0, %rax
+; CHECK-NEXT:    xorq %rdi, %rax
 ; CHECK-NEXT:    retq
   %m0 = bitcast i64 %x to <64 x i1>
   %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-vec-test-testn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-vec-test-testn.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-vec-test-testn.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-vec-test-testn.ll Sat Feb  3 17:43:48 2018
@@ -38,9 +38,9 @@ entry:
 define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -57,9 +57,9 @@ entry:
 define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovq %rdi, %k1
-; CHECK-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmb %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovq %k0, %rax
+; CHECK-NEXT:    andq %rdi, %rax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -109,9 +109,9 @@ entry:
 define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -128,9 +128,9 @@ entry:
 define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovq %rdi, %k1
-; CHECK-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmb %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovq %k0, %rax
+; CHECK-NEXT:    andq %rdi, %rax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll Sat Feb  3 17:43:48 2018
@@ -487,9 +487,9 @@ define i32 @test_pcmpeq_b_256(<32 x i8>
 define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_b_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
@@ -513,9 +513,9 @@ define i16 @test_pcmpeq_w_256(<16 x i16>
 define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_w_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
+; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -539,9 +539,9 @@ define i32 @test_pcmpgt_b_256(<32 x i8>
 define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_b_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
@@ -565,9 +565,9 @@ define i16 @test_pcmpgt_w_256(<16 x i16>
 define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_w_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
+; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -591,9 +591,9 @@ define i16 @test_pcmpeq_b_128(<16 x i8>
 define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_b_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
+; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
@@ -616,9 +616,9 @@ define i8 @test_pcmpeq_w_128(<8 x i16> %
 define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_w_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andb %dil, %al ## encoding: [0x40,0x20,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
@@ -641,9 +641,9 @@ define i16 @test_pcmpgt_b_128(<16 x i8>
 define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_b_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1]
+; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
@@ -666,9 +666,9 @@ define i8 @test_pcmpgt_w_128(<8 x i16> %
 define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_w_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1]
+; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    andb %dil, %al ## encoding: [0x40,0x20,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
@@ -3678,11 +3678,9 @@ define i16 @test_int_x86_avx512_ptestm_b_
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestmb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; CHECK-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
@@ -3697,11 +3695,9 @@ define i32 @test_int_x86_avx512_ptestm_b_
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestmb %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; CHECK-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
@@ -3716,11 +3712,9 @@ define i8 @test_int_x86_avx512_ptestm_w_1
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestmw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    andb %al, %dil ## encoding: [0x40,0x20,0xc7]
+; CHECK-NEXT:    addb %dil, %al ## encoding: [0x40,0x00,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
@@ -3735,11 +3729,9 @@ define i16 @test_int_x86_avx512_ptestm_w_
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestmw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; CHECK-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3755,11 +3747,9 @@ define i16 @test_int_x86_avx512_ptestnm_b
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestnmb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; CHECK-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
@@ -3774,11 +3764,9 @@ define i32 @test_int_x86_avx512_ptestnm_b
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestnmb %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; CHECK-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
@@ -3793,11 +3781,9 @@ define i8 @test_int_x86_avx512_ptestnm_w_
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestnmw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    andb %al, %dil ## encoding: [0x40,0x20,0xc7]
+; CHECK-NEXT:    addb %dil, %al ## encoding: [0x40,0x00,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
@@ -3812,11 +3798,9 @@ define i16 @test_int_x86_avx512_ptestnm_w
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vptestnmw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc9]
-; CHECK-NEXT:    kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; CHECK-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll Sat Feb  3 17:43:48 2018
@@ -21,9 +21,9 @@ entry:
 define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -56,9 +56,9 @@ entry:
 define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -91,9 +91,9 @@ entry:
 define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -126,9 +126,9 @@ entry:
 define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -161,9 +161,9 @@ entry:
 define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -197,9 +197,9 @@ entry:
 define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -233,9 +233,9 @@ entry:
 define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
@@ -269,9 +269,9 @@ entry:
 define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
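
Every hunk in this file has the same shape: the kmovd-into-%k1 plus masked vptestm/vptestnm pair becomes an unmasked test, a kmov out, and a scalar 'and' against the incoming mask in %edi. A minimal .ll sketch of the pattern being combined; function and value names here are illustrative, not the committed tests:

define zeroext i16 @mask_test_model(i16 zeroext %U, <16 x i8> %A, <16 x i8> %B) {
entry:
  %and = and <16 x i8> %A, %B
  %cmp = icmp ne <16 x i8> %and, zeroinitializer   ; vptestmb %xmm0, %xmm1, %k0
  %u = bitcast i16 %U to <16 x i1>
  %masked = and <16 x i1> %cmp, %u                 ; the combine moves this AND to the scalar side: andl %edi, %eax
  %res = bitcast <16 x i1> %masked to i16          ; kmovd %k0, %eax
  ret i16 %res
}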

Modified: llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512dq-mask-op.ll Sat Feb  3 17:43:48 2018
@@ -4,9 +4,9 @@
 define i8 @mask8(i8 %x) {
 ; CHECK-LABEL: mask8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k0
-; CHECK-NEXT:    knotb %k0, %k0
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    xorb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
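
The knot vanishes because NOT of a mask is an xor with all-ones: the xor moves to the GPR side (xorb %dil, %al), while the all-ones operand is still materialized in a k-register (kxnorw %k0, %k0, %k0 sets every bit) and copied out, so one k-to-GPR transition remains here. The hunk cuts the body off after %m0; the rest of mask8 below is reconstructed from the CHECK lines and is an assumption, not a quote of the test:

define i8 @mask8(i8 %x) {
  %m0 = bitcast i8 %x to <8 x i1>
  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
  %ret = bitcast <8 x i1> %m1 to i8
  ret i8 %ret
}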

Modified: llvm/trunk/test/CodeGen/X86/avx512f-vec-test-testn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512f-vec-test-testn.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512f-vec-test-testn.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512f-vec-test-testn.ll Sat Feb  3 17:43:48 2018
@@ -38,9 +38,9 @@ entry:
 define zeroext i8 @TEST_mm512_mask_test_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_test_epi64_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmq %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -57,9 +57,9 @@ entry:
 define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestmd %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -110,9 +110,9 @@ entry:
 define zeroext i8 @TEST_mm512_mask_testn_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_testn_epi64_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -129,9 +129,9 @@ entry:
 define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
 ; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll Sat Feb  3 17:43:48 2018
@@ -1073,9 +1073,9 @@ define i8 @test_pcmpeq_d_256(<8 x i32> %
 define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_d_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    andb %dil, %al ## encoding: [0x40,0x20,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
@@ -1123,9 +1123,9 @@ define i8 @test_pcmpgt_d_256(<8 x i32> %
 define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_d_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1]
+; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    andb %dil, %al ## encoding: [0x40,0x20,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
@@ -6013,11 +6013,9 @@ define i8@test_int_x86_avx512_ptestm_d_2
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestmd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc9]
-; CHECK-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    andb %al, %dil ## encoding: [0x40,0x20,0xc7]
+; CHECK-NEXT:    addb %dil, %al ## encoding: [0x40,0x00,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
@@ -6089,11 +6087,9 @@ define i8@test_int_x86_avx512_ptestnm_d_
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestnmd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc9]
-; CHECK-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    andb %al, %dil ## encoding: [0x40,0x20,0xc7]
+; CHECK-NEXT:    addb %dil, %al ## encoding: [0x40,0x00,0xf8]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
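
The ptestm/ptestnm tests sum a masked and an unmasked result. The masked half no longer needs its own kmovw in, masked vptestmd, and kmovw out; it is now the unmasked result ANDed with the mask argument in GPRs (andb %al, %dil) before the final add. A self-contained sketch of the assumed test shape (only the final %res line is visible in each hunk):

declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @ptestm_d_256_model(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
  %masked   = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
  %unmasked = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
  %sum = add i8 %masked, %unmasked                 ; addb %dil, %al
  ret i8 %sum
}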

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Sat Feb  3 17:43:48 2018
@@ -63,9 +63,9 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -91,9 +91,9 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -172,10 +172,9 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -201,10 +200,9 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -294,18 +292,19 @@ define zeroext i64 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -331,18 +330,19 @@ define zeroext i64 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -756,9 +756,9 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -785,9 +785,9 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -869,10 +869,9 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -899,10 +898,9 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1247,189 +1245,190 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm9
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm4
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm5
+; NoVLX-NEXT:    vmovq %xmm9, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm2, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm10
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm5
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm6
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm8
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm9, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm10, %ymm5, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpeqw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
-; NoVLX-NEXT:    orl %ecx, %eax
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %edx
+; NoVLX-NEXT:    andl %edi, %edx
+; NoVLX-NEXT:    shll $16, %edx
+; NoVLX-NEXT:    movzwl %cx, %eax
+; NoVLX-NEXT:    orl %edx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1454,103 +1453,104 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm1
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -2935,9 +2935,9 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2962,9 +2962,9 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3016,9 +3016,9 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3095,10 +3095,9 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3123,10 +3122,9 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3179,10 +3177,9 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5459,9 +5456,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5487,9 +5484,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5568,10 +5565,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5597,10 +5593,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5690,18 +5685,19 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -5727,18 +5723,19 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
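
In the v32i1 cases a 32-bit mask cannot be applied in one 16-lane k-register compare, so each half is now tested unmasked, ANDed against the matching half of %edi, and the halves are stitched back with shll/movzwl/orl. A scalar model of that recombination, with hypothetical names:

define i64 @v32i1_recombine_model(i32 %U, i16 %lo.test, i16 %hi.test) {
  %lo.u   = trunc i32 %U to i16
  %hi.sh  = lshr i32 %U, 16                        ; shrl $16, %edi
  %hi.u   = trunc i32 %hi.sh to i16
  %lo     = and i16 %lo.test, %lo.u                ; andl %edi, %eax
  %hi     = and i16 %hi.test, %hi.u                ; andl %edi, %ecx
  %lo32   = zext i16 %lo to i32                    ; movzwl %ax, %eax
  %hi32   = zext i16 %hi to i32
  %hi.shl = shl i32 %hi32, 16                      ; shll $16, %ecx
  %r32    = or i32 %hi.shl, %lo32                  ; orl %ecx, %eax
  %r      = zext i32 %r32 to i64
  ret i64 %r
}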
@@ -6152,9 +6149,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6181,9 +6178,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6265,10 +6262,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6295,10 +6291,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6643,189 +6638,190 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm9
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm4
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm5
+; NoVLX-NEXT:    vmovq %xmm9, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm2, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm10
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm5
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm6
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm8
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm9, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm10, %ymm5, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
-; NoVLX-NEXT:    orl %ecx, %eax
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %edx
+; NoVLX-NEXT:    andl %edi, %edx
+; NoVLX-NEXT:    shll $16, %edx
+; NoVLX-NEXT:    movzwl %cx, %eax
+; NoVLX-NEXT:    orl %edx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6850,103 +6846,104 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm1
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
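
After the combine, the tail of these v32i1 tests merges the two 16-bit compare results with the GPR mask directly: each half is ANDed with the matching half of %edi, the high half is shifted up, and the halves are ORed. A scalar sketch of that tail (names are illustrative, not from the patch):

    #include <stdint.h>

    /* Models the new andl/shrl/shll $16/movzwl/orl tail: combine two
       16-bit half-masks under one 32-bit GPR mask m. */
    static inline uint32_t combine_mask_halves(uint16_t lo_cmp,
                                               uint16_t hi_cmp,
                                               uint32_t m) {
      uint32_t lo = (uint16_t)(lo_cmp & m);        /* andl + movzwl */
      uint32_t hi = (hi_cmp & (m >> 16)) << 16;    /* andl + shll $16 */
      return hi | lo;                              /* orl */
    }
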
@@ -8331,9 +8328,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
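
These simpler hunks show the core rewrite: the write-masked compare (kmovw %edi, %k1 plus {%k1}) becomes an unmasked compare whose result is ANDed with the mask in a GPR. The two forms are equivalent because a zeroing write-mask on a compare-into-k just ANDs the compare result with the mask bits. A hedged intrinsics sketch of the same identity (wrapper names are invented):

    #include <immintrin.h>
    #include <stdint.h>

    /* Old shape: mask applied through a k-register. */
    static uint16_t cmp_masked(__m512i a, __m512i b, __mmask16 m) {
      return _mm512_mask_cmpgt_epi32_mask(m, a, b);
    }

    /* New shape: unmasked compare, then a scalar AND (kmovw + andl). */
    static uint16_t cmp_then_and(__m512i a, __m512i b, uint16_t m) {
      return (uint16_t)(_mm512_cmpgt_epi32_mask(a, b) & m);
    }
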
@@ -8358,9 +8355,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8412,9 +8409,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8491,10 +8488,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8519,10 +8515,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
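
Note that in the v64i1 variants the old movzwl %ax, %eax is not just replaced but subsumed: kmovw %k0, %eax leaves only 16 live bits in %eax, and a 32-bit andl can clear bits but never set them, so the result is still correctly zero-extended for the i64 return. A one-line model (hypothetical helper name):

    #include <stdint.h>

    /* k holds at most 16 set bits, so (k & m) needs no extra movzwl
       before widening to 64 bits. */
    static inline uint64_t zext_masked_bits(uint16_t k, uint32_t m) {
      return (uint64_t)(k & m);
    }
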
@@ -8575,10 +8570,9 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10862,9 +10856,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
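
The vpternlogq $15 in these sge lowerings is a bitwise NOT (here sge(a,b) is computed as NOT(gt(b,a))): each result bit of a ternary-logic instruction is bit ((a<<2)|(b<<1)|c) of the immediate, and 0x0F is set exactly where the first operand's bit is 0. A scalar model of that truth-table lookup (sketch only):

    #include <stdint.h>

    /* One bit of vpternlog: result = imm8[(a<<2) | (b<<1) | c].
       imm8 = 15 (0b00001111) selects the rows with a == 0, i.e. ~a. */
    static inline unsigned ternlog_bit(uint8_t imm8, unsigned a,
                                       unsigned b, unsigned c) {
      return (imm8 >> ((a << 2) | (b << 1) | c)) & 1u;
    }
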
@@ -10893,9 +10887,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10981,10 +10975,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11013,10 +11006,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11113,21 +11105,22 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -11153,22 +11146,23 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -11619,9 +11613,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11651,9 +11645,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11742,10 +11736,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11775,10 +11768,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12133,193 +12125,194 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm9
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm4
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovq %xmm9, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm2, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm10
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm4
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm6
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm8
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm9, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rdx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    movl %edx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rdx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm10, %ymm4, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
-; NoVLX-NEXT:    orl %ecx, %eax
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %edx
+; NoVLX-NEXT:    andl %edi, %edx
+; NoVLX-NEXT:    shll $16, %edx
+; NoVLX-NEXT:    movzwl %cx, %eax
+; NoVLX-NEXT:    orl %edx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12344,109 +12337,110 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm0
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
-; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm1
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm0
+; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -13831,9 +13825,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13858,9 +13852,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13912,9 +13906,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13991,10 +13985,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14019,10 +14012,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14075,10 +14067,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16364,9 +16355,9 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16395,9 +16386,9 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16485,10 +16476,9 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16517,10 +16507,9 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
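
The vpxor with a 128 splat before vpcmpgtb in these ult tests implements the unsigned comparison: XORing the sign bit maps unsigned order onto signed order, so unsigned less-than becomes a signed greater-than with swapped operands. The word-sized variants further down use a 32768 splat the same way. A scalar sketch of the identity:

    #include <stdint.h>

    /* a <u b  <=>  (a ^ 0x80) <s (b ^ 0x80) for 8-bit values. */
    static inline int ult8(uint8_t a, uint8_t b) {
      return (int8_t)(a ^ 0x80) < (int8_t)(b ^ 0x80);
    }
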
@@ -16616,21 +16605,22 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -16656,21 +16646,22 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -17129,9 +17120,9 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17161,9 +17152,9 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17254,10 +17245,9 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17287,10 +17277,9 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17645,194 +17634,195 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm9
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vmovd %eax, %xmm4
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm5
+; NoVLX-NEXT:    vmovq %xmm9, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm2, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm10
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm5
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm6
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm8
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm9, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %edx
+; NoVLX-NEXT:    shrl $16, %edx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rdx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    movl %edx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rdx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm2, %ymm3, %ymm3
+; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm10, %ymm5, %ymm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpxor %ymm2, %ymm3, %ymm1
+; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0 {%k2}
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
-; NoVLX-NEXT:    orl %ecx, %eax
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %edx
+; NoVLX-NEXT:    andl %edi, %edx
+; NoVLX-NEXT:    shll $16, %edx
+; NoVLX-NEXT:    movzwl %cx, %eax
+; NoVLX-NEXT:    orl %edx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17858,107 +17848,108 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %edx, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rdx, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %edx
+; NoVLX-NEXT:    shrl $16, %edx
 ; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rdx
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rdx
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %edx, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rdx, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpxor 32(%rsi), %ymm2, %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm4
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT:    shrq $48, %rdx
+; NoVLX-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    kmovw %edi, %k2
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    andl %edi, %eax
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm0
+; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    andl %edi, %ecx
+; NoVLX-NEXT:    shll $16, %ecx
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
@@ -19343,9 +19334,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19370,9 +19361,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19424,9 +19415,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19503,10 +19494,9 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19531,10 +19521,9 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19587,10 +19576,9 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23188,9 +23176,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23215,9 +23203,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23243,9 +23231,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23390,10 +23378,9 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23418,10 +23405,9 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23447,10 +23433,9 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
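
The simpler hunks in this file (the vpcmpultd and vcmpoeqps ones) all exercise the same shape: a 512-bit compare whose <16 x i1> result is ANDed with an i16 GPR mask argument and then consumed as a scalar. A hand-written IR sketch of that shape (not copied from the test file; the function name is hypothetical):

define i32 @ultd_mask_sketch(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
entry:
  %cmp = icmp ult <16 x i32> %a, %b          ; lowers to vpcmpltud, writing a k-register
  %m = bitcast i16 %mask to <16 x i1>
  %and = and <16 x i1> %m, %cmp              ; mask AND of two <16 x i1> values
  %scalar = bitcast <16 x i1> %and to i16    ; result is only ever used as an integer
  %ret = zext i16 %scalar to i32
  ret i32 %ret
}

Because the AND result is only used as an integer, doing the AND in GPRs (kmovw %k0, %eax followed by andl %edi, %eax) avoids moving the mask argument into a k-register (kmovw %edi, %k1) just to fold it into a {%k1}-masked compare, which is the before/after these CHECK updates show. The v32i1 word variants do the same thing twice, once per 16-bit half of the i32 mask, which is why the new code ANDs, shifts, and ORs in GPRs instead of performing two masked vptestmd operations.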

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-test-testn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-test-testn.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-test-testn.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-test-testn.ll Sat Feb  3 17:43:48 2018
@@ -194,19 +194,18 @@ entry:
 define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
 ; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
 ; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
+; X86_64-NEXT:    vptestmd %ymm0, %ymm1, %k0
 ; X86_64-NEXT:    kmovw %k0, %eax
+; X86_64-NEXT:    andb %dil, %al
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    vzeroupper
 ; X86_64-NEXT:    retq
 ;
 ; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
 ; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
+; I386-NEXT:    vptestmd %ymm0, %ymm1, %k0
 ; I386-NEXT:    kmovw %k0, %eax
+; I386-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; I386-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NEXT:    vzeroupper
 ; I386-NEXT:    retl
@@ -412,19 +411,18 @@ entry:
 define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
 ; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
 ; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
+; X86_64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
 ; X86_64-NEXT:    kmovw %k0, %eax
+; X86_64-NEXT:    andb %dil, %al
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    vzeroupper
 ; X86_64-NEXT:    retq
 ;
 ; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
 ; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
+; I386-NEXT:    vptestnmd %ymm0, %ymm1, %k0
 ; I386-NEXT:    kmovw %k0, %eax
+; I386-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; I386-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NEXT:    vzeroupper
 ; I386-NEXT:    retl

Modified: llvm/trunk/test/CodeGen/X86/combine-testm-and.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-testm-and.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-testm-and.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-testm-and.ll Sat Feb  3 17:43:48 2018
@@ -17,9 +17,10 @@ define i32 @combineTESTM_AND_1(<8 x i64>
 define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
 ; CHECK-LABEL: combineTESTM_AND_2:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vptestmq %zmm0, %zmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb %dil, %al
+; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %and.i = and <8 x i64> %b, %a
@@ -31,9 +32,10 @@ define i32 @combineTESTM_AND_2(<8 x i64>
 define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
 ; CHECK-LABEL: combineTESTM_AND_mask_3:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vptestmq (%rdi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vptestmq (%rdi), %zmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %bptr
@@ -46,9 +48,10 @@ define i32 @combineTESTM_AND_mask_3(<8 x
 define i32 @combineTESTM_AND_mask_4(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
 ; CHECK-LABEL: combineTESTM_AND_mask_4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vptestmq (%rdi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vptestmq (%rdi), %zmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %bptr
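
Only the `%and.i = and <8 x i64> %b, %a` line of these combineTESTM functions is visible in the hunk context; a plausible reconstruction of the full pattern (a sketch, so details may differ from the actual test bodies):

define i32 @testm_and_sketch(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
  %and.i = and <8 x i64> %b, %a                      ; folded into vptestmq
  %test = icmp ne <8 x i64> %and.i, zeroinitializer  ; vptestmq writes k0
  %m = bitcast i8 %mask to <8 x i1>
  %res = and <8 x i1> %test, %m                      ; mask AND, now done in GPRs
  %cast = bitcast <8 x i1> %res to i8
  %conv = zext i8 %cast to i32
  ret i32 %conv
}

The rewritten output reads the vptestmq result with kmovd, ANDs the GPR mask in with andb %dil, %al, and needs an explicit movzbl %al, %eax to zero-extend for the i32 return; the old kmovb %k0, %eax zero-extended into %eax implicitly.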

Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll?rev=324184&r1=324183&r2=324184&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll Sat Feb  3 17:43:48 2018
@@ -540,7 +540,7 @@ define i32 @stack_fold_pcmpeqw(<32 x i16
   ret i32 %3
 }
 
-define i16 @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
   ;CHECK-LABEL: stack_fold_pcmpeqd_mask
   ;CHECK:       vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
@@ -550,11 +550,11 @@ define i16 @stack_fold_pcmpeqd_mask(<16
   %4 = bitcast i16 %mask to <16 x i1>
   %5 = icmp eq <16 x i32> %3, %a0
   %6 = and <16 x i1> %4, %5
-  %7 = bitcast <16 x i1> %6 to i16
-  ret i16 %7
+  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
+  ret <16 x i32> %7
 }
 
-define i16 @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
   ;CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted
   ;CHECK:       vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
@@ -564,11 +564,11 @@ define i16 @stack_fold_pcmpeqd_mask_comm
   %4 = bitcast i16 %mask to <16 x i1>
   %5 = icmp eq <16 x i32> %a0, %3
   %6 = and <16 x i1> %4, %5
-  %7 = bitcast <16 x i1> %6 to i16
-  ret i16 %7
+  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
+  ret <16 x i32> %7
 }
 
-define i16 @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
   ;CHECK-LABEL: stack_fold_pcmpled_mask
   ;CHECK:       vpcmpled {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
@@ -578,8 +578,8 @@ define i16 @stack_fold_pcmpled_mask(<16
   %4 = bitcast i16 %mask to <16 x i1>
   %5 = icmp sge <16 x i32> %a0, %3
   %6 = and <16 x i1> %4, %5
-  %7 = bitcast <16 x i1> %6 to i16
-  ret i16 %7
+  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
+  ret <16 x i32> %7
 }
 
 define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
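
Unlike the hunks above, these stack-folding tests change their IR rather than their CHECK lines. A plausible reason (an inference; the diff itself does not say): with this combine, the `and <16 x i1>` feeding `bitcast ... to i16` would be rewritten into a scalar AND in GPRs, and the k-masked `vpcmpeqd ... {%k}` folded-reload encoding these CHECK patterns insist on would no longer be emitted. Routing the i1 vector into a select keeps it live as a vector, so the AND stays in mask registers. A minimal sketch of that idiom (hypothetical names):

define <16 x i32> @select_keeps_kreg(<16 x i32> %x, <16 x i32> %y, i16 %mask, <16 x i32> %t, <16 x i32> %f) {
  %c = icmp eq <16 x i32> %x, %y     ; vpcmpeqd
  %m = bitcast i16 %mask to <16 x i1>
  %a = and <16 x i1> %m, %c          ; can fold into the compare as a {%k} write-mask
  %r = select <16 x i1> %a, <16 x i32> %t, <16 x i32> %f  ; k-masked blend/move
  ret <16 x i32> %r
}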



