[llvm] [SDAG] Fix incorrect usage of VECREDUCE_ADD (PR #171459)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 06:35:42 PST 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/171459
>From 912d00b1183467f7e88149d19ef35d38d6f5d839 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 9 Dec 2025 15:39:42 +0000
Subject: [PATCH 1/4] [SDAG] Fix incorrect usage of VECREDUCE_ADD
The mask needs to be extended to `i32` before reducing, or the reduction
can be incorrectly optimized to a VECREDUCE_XOR.
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 3 ++-
.../CodeGen/AArch64/sve-vector-compress.ll | 25 ++++++++++---------
2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index da3102d30e153..1a82cdc2206e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2664,7 +2664,8 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
// We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
SDValue WideMask =
- DAG.getNode(ISD::ZERO_EXTEND, DL, LoMask.getValueType(), LoMask);
+ DAG.getNode(ISD::ZERO_EXTEND, DL,
+ LoMask.getValueType().changeElementType(MVT::i32), LoMask);
SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index f700dee0fb2e4..cfd343e94baa4 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -145,17 +145,17 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: cnth x9
-; CHECK-NEXT: ptrue p2.s
-; CHECK-NEXT: sub x9, x9, #1
; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: sub x9, x9, #1
+; CHECK-NEXT: cntp x8, p1, p1.s
; CHECK-NEXT: compact z0.s, p1, z0.s
-; CHECK-NEXT: cntp x8, p2, p1.s
; CHECK-NEXT: compact z1.s, p0, z1.s
-; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1w { z1.s }, p2, [x9, x8, lsl #2]
+; CHECK-NEXT: st1w { z1.s }, p0, [x9, x8, lsl #2]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #2
@@ -231,23 +231,24 @@ define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %m
; CHECK-NEXT: sub sp, sp, #32
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: movi v5.2s, #1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ushll v3.2d, v2.2s, #0
; CHECK-NEXT: ushll2 v4.2d, v2.4s, #0
-; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: and v2.8b, v2.8b, v5.8b
; CHECK-NEXT: shl v3.2d, v3.2d, #63
; CHECK-NEXT: shl v4.2d, v4.2d, #63
-; CHECK-NEXT: lsr x9, x8, #32
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: addp v2.2s, v2.2s, v2.2s
; CHECK-NEXT: cmlt v3.2d, v3.2d, #0
; CHECK-NEXT: cmlt v4.2d, v4.2d, #0
-; CHECK-NEXT: and x8, x8, #0x3
-; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: and z3.d, z3.d, #0x1
; CHECK-NEXT: and z4.d, z4.d, #0x1
+; CHECK-NEXT: and x8, x8, #0x3
+; CHECK-NEXT: lsl x8, x8, #3
; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0
; CHECK-NEXT: compact z0.d, p1, z0.d
>From 983fc77402c29aa80cfbb4c88c887c3cb9bccad6 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 9 Dec 2025 16:32:18 +0000
Subject: [PATCH 2/4] Update X86 test checks
---
llvm/test/CodeGen/X86/vector-compress.ll | 1625 ++++++++++------------
1 file changed, 743 insertions(+), 882 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index 1a6351524ffbd..01bdf0a098e7a 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -1583,20 +1583,24 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8>
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, (%rsp)
-; AVX512F-NEXT: kshiftrw $8, %k2, %k0
-; AVX512F-NEXT: kxorw %k0, %k2, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k2
-; AVX512F-NEXT: kxorw %k2, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k2
-; AVX512F-NEXT: kxorw %k2, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k2
-; AVX512F-NEXT: kxorw %k2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-NEXT: vpaddd %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpextrd $1, %xmm3, %eax
+; AVX512F-NEXT: vmovd %xmm3, %ecx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm3, %eax
+; AVX512F-NEXT: vpextrd $3, %xmm3, %edx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
+; AVX512F-NEXT: andl $31, %edx
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, (%rsp,%rax)
+; AVX512F-NEXT: vpmovdb %zmm0, (%rsp,%rdx)
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm0
; AVX512F-NEXT: vpblendvb %ymm0, (%rsp), %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
@@ -2417,31 +2421,36 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: movw $-5, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k1, %k3
+; AVX512F-NEXT: kmovw %k1, %k6
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 368(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $13, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-9, %ax
-; AVX512F-NEXT: kmovw %eax, %k5
-; AVX512F-NEXT: kandw %k5, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: movzbl 376(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $12, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-17, %ax
-; AVX512F-NEXT: kmovw %eax, %k6
-; AVX512F-NEXT: kandw %k6, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: movzbl 384(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $11, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-33, %ax
-; AVX512F-NEXT: kmovw %eax, %k7
-; AVX512F-NEXT: kandw %k7, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %k1, %k2
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 392(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -2458,8 +2467,9 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-129, %ax
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %k1, %k3
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 408(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -2476,8 +2486,9 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %k1, %k4
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 424(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -2493,9 +2504,8 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: kshiftrw $5, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k5
+; AVX512F-NEXT: kandw %k5, %k0, %k0
; AVX512F-NEXT: movzbl 440(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -2520,19 +2530,19 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: kshiftrw $2, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k1
; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF
-; AVX512F-NEXT: kmovw %eax, %k4
-; AVX512F-NEXT: kandw %k4, %k1, %k1
-; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k0
+; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 464(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $14, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kshiftlw $1, %k1, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k1
; AVX512F-NEXT: movzbl 472(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 224(%rbp), %eax
; AVX512F-NEXT: andl $1, %eax
@@ -2540,110 +2550,100 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: kmovw %r10d, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k3, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: korw %k1, %k7, %k1
+; AVX512F-NEXT: kandw %k6, %k1, %k1
; AVX512F-NEXT: movzbl 240(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k5, %k1, %k1
-; AVX512F-NEXT: movzbl 248(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $12, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $13, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512F-NEXT: kandw %k6, %k1, %k1
-; AVX512F-NEXT: movzbl 256(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $11, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k7, %k1, %k1
-; AVX512F-NEXT: movzbl 264(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 248(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $12, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: movzbl 256(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $11, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 264(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $10, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512F-NEXT: kandw %k2, %k1, %k1
; AVX512F-NEXT: movzbl 272(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $9, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $9, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 280(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k1, %k1
-; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k0, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $8, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 288(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k0, %k2
-; AVX512F-NEXT: kshiftrw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $7, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k4, %k1, %k1
; AVX512F-NEXT: movzbl 296(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $6, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $6, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512F-NEXT: kandw %k4, %k1, %k1
; AVX512F-NEXT: movzbl 304(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $5, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $5, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k5, %k1, %k1
; AVX512F-NEXT: movzbl 312(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $4, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $4, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 320(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $3, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $3, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 328(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $2, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k4, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $2, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 336(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $14, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kshiftlw $1, %k1, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k1
; AVX512F-NEXT: movzbl 344(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 96(%rbp), %eax
; AVX512F-NEXT: andl $1, %eax
@@ -2651,344 +2651,267 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8>
; AVX512F-NEXT: kmovw %r10d, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kandw %k3, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: korw %k1, %k7, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 112(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k5, %k4
-; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k5, %k1, %k1
-; AVX512F-NEXT: movzbl 120(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $12, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k6, %k5
-; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $13, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kandw %k6, %k1, %k1
+; AVX512F-NEXT: movzbl 120(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $12, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 128(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $11, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k7, %k6
-; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $11, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 136(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kandw %k7, %k1, %k1
-; AVX512F-NEXT: movzbl 144(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $9, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $10, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 144(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $9, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 152(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $8, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $8, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 160(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $7, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 168(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $6, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $6, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k4, %k1, %k1
; AVX512F-NEXT: movzbl 176(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $5, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $5, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k5, %k1, %k1
; AVX512F-NEXT: movzbl 184(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $4, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
-; AVX512F-NEXT: movzbl 192(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $3, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $4, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512F-NEXT: kandw %k0, %k1, %k1
-; AVX512F-NEXT: movzbl 200(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $2, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 192(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $3, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 200(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $2, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k1, %k1
; AVX512F-NEXT: movzbl 208(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $14, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kshiftlw $1, %k1, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k1
; AVX512F-NEXT: movzbl 216(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: andl $1, %edi
-; AVX512F-NEXT: kmovw %esi, %k1
-; AVX512F-NEXT: kshiftlw $15, %k1, %k1
-; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %edx, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k4, %k1, %k1
-; AVX512F-NEXT: kmovw %ecx, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $12, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k5, %k1, %k1
-; AVX512F-NEXT: kmovw %r8d, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $11, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k6, %k1, %k1
-; AVX512F-NEXT: kmovw %r9d, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw %esi, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $14, %k7, %k7
+; AVX512F-NEXT: kmovw %edi, %k6
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %edx, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $13, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %ecx, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $12, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %r8d, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $11, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %r9d, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $10, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
; AVX512F-NEXT: movzbl 16(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $9, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kandw %k1, %k2, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $9, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
; AVX512F-NEXT: movzbl 24(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $8, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k3, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $8, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
; AVX512F-NEXT: movzbl 32(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $7, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kandw %k3, %k6, %k6
; AVX512F-NEXT: movzbl 40(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $6, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $6, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kandw %k4, %k6, %k5
; AVX512F-NEXT: movzbl 48(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k5
-; AVX512F-NEXT: kshiftrw $5, %k5, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k6
+; AVX512F-NEXT: kshiftlw $15, %k6, %k6
+; AVX512F-NEXT: kshiftrw $5, %k6, %k6
+; AVX512F-NEXT: korw %k6, %k5, %k5
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kandw %k3, %k5, %k4
; AVX512F-NEXT: movzbl 56(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k4
-; AVX512F-NEXT: kshiftrw $4, %k4, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k5
+; AVX512F-NEXT: kshiftlw $15, %k5, %k5
+; AVX512F-NEXT: kshiftrw $4, %k5, %k5
+; AVX512F-NEXT: korw %k5, %k4, %k4
+; AVX512F-NEXT: kandw %k0, %k4, %k3
; AVX512F-NEXT: movzbl 64(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k3
-; AVX512F-NEXT: kshiftrw $3, %k3, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k4
+; AVX512F-NEXT: kshiftlw $15, %k4, %k4
+; AVX512F-NEXT: kshiftrw $3, %k4, %k4
+; AVX512F-NEXT: korw %k4, %k3, %k3
+; AVX512F-NEXT: kandw %k2, %k3, %k2
; AVX512F-NEXT: movzbl 72(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $2, %k2, %k0
-; AVX512F-NEXT: korw %k0, %k1, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k3
+; AVX512F-NEXT: kshiftlw $15, %k3, %k3
+; AVX512F-NEXT: kshiftrw $2, %k3, %k3
+; AVX512F-NEXT: korw %k3, %k2, %k2
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512F-NEXT: kandw %k0, %k2, %k0
; AVX512F-NEXT: movzbl 80(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kshiftlw $14, %k1, %k7
-; AVX512F-NEXT: korw %k7, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k2
+; AVX512F-NEXT: kshiftlw $14, %k2, %k2
+; AVX512F-NEXT: korw %k2, %k0, %k0
; AVX512F-NEXT: kshiftlw $1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k7
+; AVX512F-NEXT: kshiftrw $1, %k0, %k0
; AVX512F-NEXT: movzbl 88(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: kshiftlw $15, %k0, %k6
-; AVX512F-NEXT: korw %k6, %k7, %k6
-; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: movw $-3, %ax
-; AVX512F-NEXT: kmovw %eax, %k6
-; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kandw %k6, %k7, %k6
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $14, %k7, %k7
-; AVX512F-NEXT: korw %k7, %k6, %k6
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kandw %k7, %k6, %k6
-; AVX512F-NEXT: kshiftrw $13, %k5, %k5
-; AVX512F-NEXT: korw %k5, %k6, %k5
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512F-NEXT: kandw %k6, %k5, %k5
-; AVX512F-NEXT: kshiftrw $12, %k4, %k4
-; AVX512F-NEXT: korw %k4, %k5, %k4
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512F-NEXT: kandw %k5, %k4, %k4
-; AVX512F-NEXT: kshiftrw $11, %k3, %k3
-; AVX512F-NEXT: korw %k3, %k4, %k3
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512F-NEXT: kandw %k4, %k3, %k3
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k3, %k2
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512F-NEXT: kandw %k3, %k2, %k2
-; AVX512F-NEXT: kshiftlw $6, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kshiftlw $9, %k1, %k1
-; AVX512F-NEXT: kshiftrw $9, %k1, %k1
-; AVX512F-NEXT: kshiftlw $7, %k0, %k0
-; AVX512F-NEXT: korw %k0, %k1, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $9, %k1, %k1
-; AVX512F-NEXT: kshiftrw $9, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kxorw %k0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k7, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $13, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k6, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $12, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k5, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $11, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k4, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $10, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k3, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $6, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftlw $9, %k0, %k0
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $7, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $9, %k1, %k1
-; AVX512F-NEXT: kshiftrw $9, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kxorw %k0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: kmovw %eax, %k2
+; AVX512F-NEXT: kshiftlw $15, %k2, %k2
+; AVX512F-NEXT: korw %k2, %k0, %k2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT: vpcompressd %zmm2, %zmm4 {%k2} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm7
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512F-NEXT: kxorw %k2, %k3, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %edx
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z}
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpcompressd %zmm6, %zmm6 {%k3} {z}
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm7 {%k3} {z} = -1
+; AVX512F-NEXT: vpcompressd %zmm3, %zmm5 {%k2} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k2} {z} = -1
-; AVX512F-NEXT: vpmovdb %zmm6, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vpmovdb %zmm0, 64(%rsp,%rax)
-; AVX512F-NEXT: vpmovdb %zmm3, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $31, %ecx
-; AVX512F-NEXT: vpmovdb %zmm2, 96(%rsp,%rcx)
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm6 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512F-NEXT: vpcompressd %zmm7, %zmm7 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm4, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpsrld $31, %zmm2, %zmm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512F-NEXT: vpaddd %ymm4, %ymm9, %ymm4
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm9
+; AVX512F-NEXT: vpaddd %xmm4, %xmm9, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
+; AVX512F-NEXT: vmovd %xmm4, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: addl %ecx, %esi
+; AVX512F-NEXT: andl $31, %esi
+; AVX512F-NEXT: vpmovdb %zmm6, 64(%rsp,%rsi)
+; AVX512F-NEXT: vpmovdb %zmm5, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpsrld $31, %zmm3, %zmm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm5
+; AVX512F-NEXT: vpaddd %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512F-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
+; AVX512F-NEXT: vmovd %xmm4, %ecx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
+; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
+; AVX512F-NEXT: andl $31, %edx
+; AVX512F-NEXT: vpmovdb %zmm7, 96(%rsp,%rdx)
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX512F-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm4
+; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm5
+; AVX512F-NEXT: vpaddd %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512F-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
+; AVX512F-NEXT: vmovd %xmm4, %ecx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm4, %eax
+; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
; AVX512F-NEXT: andl $63, %edx
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX512F-NEXT: vmovaps %ymm0, 128(%rsp,%rdx)
-; AVX512F-NEXT: vpmovdb %zmm4, %xmm0
-; AVX512F-NEXT: vpmovdb %zmm5, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm2, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm7, %xmm2
-; AVX512F-NEXT: vpmovdb %zmm8, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, {{[0-9]+}}(%rsp), %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX512F-NEXT: vmovaps %ymm4, 128(%rsp,%rdx)
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vpmovdb %zmm8, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm3, {{[0-9]+}}(%rsp), %ymm4, %ymm3
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
@@ -3273,26 +3196,30 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x
; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vpmovdw %zmm1, (%rsp)
-; AVX512F-NEXT: kshiftrw $8, %k2, %k0
-; AVX512F-NEXT: kxorw %k0, %k2, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k2
-; AVX512F-NEXT: kxorw %k2, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k2
-; AVX512F-NEXT: kxorw %k2, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k2
-; AVX512F-NEXT: kxorw %k2, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm5
+; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
+; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpcompressd %zmm5, %zmm5 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm5, (%rsp)
+; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512F-NEXT: vpaddd %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: vpaddd %xmm5, %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
+; AVX512F-NEXT: vmovd %xmm1, %ecx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
+; AVX512F-NEXT: vpextrd $3, %xmm1, %edx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
+; AVX512F-NEXT: andl $31, %edx
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rax,2)
+; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rdx,2)
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1
; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
@@ -3793,31 +3720,36 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: movw $-5, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k1, %k3
+; AVX512F-NEXT: kmovw %k1, %k6
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 368(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $13, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-9, %ax
-; AVX512F-NEXT: kmovw %eax, %k5
-; AVX512F-NEXT: kandw %k5, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: movzbl 376(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $12, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-17, %ax
-; AVX512F-NEXT: kmovw %eax, %k6
-; AVX512F-NEXT: kandw %k6, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: movzbl 384(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $11, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-33, %ax
-; AVX512F-NEXT: kmovw %eax, %k7
-; AVX512F-NEXT: kandw %k7, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %k1, %k2
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 392(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -3834,8 +3766,9 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-129, %ax
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %k1, %k3
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 408(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -3852,8 +3785,9 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %k1, %k4
+; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 424(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -3869,9 +3803,8 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: kshiftrw $5, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k5
+; AVX512F-NEXT: kandw %k5, %k0, %k0
; AVX512F-NEXT: movzbl 440(%rbp), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
@@ -3896,19 +3829,19 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: kshiftrw $2, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k1
; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF
-; AVX512F-NEXT: kmovw %eax, %k4
-; AVX512F-NEXT: kandw %k4, %k1, %k1
-; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k0
+; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 464(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $14, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kshiftlw $1, %k1, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k1
; AVX512F-NEXT: movzbl 472(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 224(%rbp), %eax
; AVX512F-NEXT: andl $1, %eax
@@ -3916,110 +3849,100 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: kmovw %r10d, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k3, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: korw %k1, %k7, %k1
+; AVX512F-NEXT: kandw %k6, %k1, %k1
; AVX512F-NEXT: movzbl 240(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k5, %k1, %k1
-; AVX512F-NEXT: movzbl 248(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $12, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $13, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512F-NEXT: kandw %k6, %k1, %k1
-; AVX512F-NEXT: movzbl 256(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $11, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k7, %k1, %k1
-; AVX512F-NEXT: movzbl 264(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 248(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $12, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: movzbl 256(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $11, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 264(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $10, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512F-NEXT: kandw %k2, %k1, %k1
; AVX512F-NEXT: movzbl 272(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $9, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $9, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 280(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k1, %k1
-; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k0, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $8, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 288(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k0, %k2
-; AVX512F-NEXT: kshiftrw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $7, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k4, %k1, %k1
; AVX512F-NEXT: movzbl 296(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $6, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $6, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512F-NEXT: kandw %k4, %k1, %k1
; AVX512F-NEXT: movzbl 304(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $5, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $5, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kandw %k5, %k1, %k1
; AVX512F-NEXT: movzbl 312(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $4, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $4, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 320(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k0
-; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $3, %k0, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $3, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 328(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $2, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k4, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $2, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 336(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $14, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kshiftlw $1, %k1, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k1
; AVX512F-NEXT: movzbl 344(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-NEXT: movzbl 96(%rbp), %eax
; AVX512F-NEXT: andl $1, %eax
@@ -4027,327 +3950,253 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512F-NEXT: kmovw %r10d, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kandw %k3, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: korw %k1, %k7, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512F-NEXT: kandw %k7, %k1, %k1
; AVX512F-NEXT: movzbl 112(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k5, %k4
-; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k5, %k1, %k1
-; AVX512F-NEXT: movzbl 120(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $12, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k6, %k5
-; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $13, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kandw %k6, %k1, %k1
+; AVX512F-NEXT: movzbl 120(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $12, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 128(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $11, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k7, %k6
-; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kandw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $11, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 136(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kandw %k7, %k1, %k1
-; AVX512F-NEXT: movzbl 144(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $9, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $10, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 144(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $9, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movzbl 152(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $8, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $8, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 160(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $7, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kandw %k3, %k1, %k1
; AVX512F-NEXT: movzbl 168(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $6, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $6, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k4, %k1, %k1
; AVX512F-NEXT: movzbl 176(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $5, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $5, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kandw %k5, %k1, %k1
; AVX512F-NEXT: movzbl 184(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $4, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
-; AVX512F-NEXT: movzbl 192(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $3, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $4, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512F-NEXT: kandw %k0, %k1, %k1
-; AVX512F-NEXT: movzbl 200(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $2, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 192(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $3, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: movzbl 200(%rbp), %eax
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $2, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k1, %k1
; AVX512F-NEXT: movzbl 208(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $14, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: kshiftlw $1, %k1, %k1
; AVX512F-NEXT: kshiftrw $1, %k1, %k1
; AVX512F-NEXT: movzbl 216(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k1, %k1
; AVX512F-NEXT: andl $1, %edi
-; AVX512F-NEXT: kmovw %esi, %k1
-; AVX512F-NEXT: kshiftlw $15, %k1, %k1
-; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw %edx, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k4, %k1, %k1
-; AVX512F-NEXT: kmovw %ecx, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $12, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k5, %k1, %k1
-; AVX512F-NEXT: kmovw %r8d, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $11, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k6, %k1, %k1
-; AVX512F-NEXT: kmovw %r9d, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k7, %k1, %k1
+; AVX512F-NEXT: kmovw %esi, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $14, %k7, %k7
+; AVX512F-NEXT: kmovw %edi, %k6
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %edx, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $13, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %ecx, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $12, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %r8d, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $11, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
+; AVX512F-NEXT: kmovw %r9d, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $10, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
; AVX512F-NEXT: movzbl 16(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $9, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kandw %k1, %k2, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $9, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
; AVX512F-NEXT: movzbl 24(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $8, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k3, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $8, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512F-NEXT: kandw %k5, %k6, %k6
; AVX512F-NEXT: movzbl 32(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $7, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kandw %k3, %k6, %k6
; AVX512F-NEXT: movzbl 40(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kshiftrw $6, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k7
+; AVX512F-NEXT: kshiftlw $15, %k7, %k7
+; AVX512F-NEXT: kshiftrw $6, %k7, %k7
+; AVX512F-NEXT: korw %k7, %k6, %k6
+; AVX512F-NEXT: kandw %k4, %k6, %k5
; AVX512F-NEXT: movzbl 48(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k5
-; AVX512F-NEXT: kshiftrw $5, %k5, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k6
+; AVX512F-NEXT: kshiftlw $15, %k6, %k6
+; AVX512F-NEXT: kshiftrw $5, %k6, %k6
+; AVX512F-NEXT: korw %k6, %k5, %k5
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512F-NEXT: kandw %k3, %k5, %k4
; AVX512F-NEXT: movzbl 56(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k4
-; AVX512F-NEXT: kshiftrw $4, %k4, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kandw %k2, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k5
+; AVX512F-NEXT: kshiftlw $15, %k5, %k5
+; AVX512F-NEXT: kshiftrw $4, %k5, %k5
+; AVX512F-NEXT: korw %k5, %k4, %k4
+; AVX512F-NEXT: kandw %k0, %k4, %k3
; AVX512F-NEXT: movzbl 64(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k3
-; AVX512F-NEXT: kshiftrw $3, %k3, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kandw %k0, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k4
+; AVX512F-NEXT: kshiftlw $15, %k4, %k4
+; AVX512F-NEXT: kshiftrw $3, %k4, %k4
+; AVX512F-NEXT: korw %k4, %k3, %k3
+; AVX512F-NEXT: kandw %k2, %k3, %k2
; AVX512F-NEXT: movzbl 72(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $2, %k2, %k0
-; AVX512F-NEXT: korw %k0, %k1, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kandw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k3
+; AVX512F-NEXT: kshiftlw $15, %k3, %k3
+; AVX512F-NEXT: kshiftrw $2, %k3, %k3
+; AVX512F-NEXT: korw %k3, %k2, %k2
+; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512F-NEXT: kandw %k0, %k2, %k0
; AVX512F-NEXT: movzbl 80(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kshiftlw $14, %k1, %k7
-; AVX512F-NEXT: korw %k7, %k0, %k0
+; AVX512F-NEXT: kmovw %eax, %k2
+; AVX512F-NEXT: kshiftlw $14, %k2, %k2
+; AVX512F-NEXT: korw %k2, %k0, %k0
; AVX512F-NEXT: kshiftlw $1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k7
+; AVX512F-NEXT: kshiftrw $1, %k0, %k0
; AVX512F-NEXT: movzbl 88(%rbp), %eax
-; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: kshiftlw $15, %k0, %k6
-; AVX512F-NEXT: korw %k6, %k7, %k6
-; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: movw $-3, %ax
-; AVX512F-NEXT: kmovw %eax, %k6
-; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kandw %k6, %k7, %k6
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $14, %k7, %k7
-; AVX512F-NEXT: korw %k7, %k6, %k6
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512F-NEXT: kandw %k7, %k6, %k6
-; AVX512F-NEXT: kshiftrw $13, %k5, %k5
-; AVX512F-NEXT: korw %k5, %k6, %k5
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512F-NEXT: kandw %k6, %k5, %k5
-; AVX512F-NEXT: kshiftrw $12, %k4, %k4
-; AVX512F-NEXT: korw %k4, %k5, %k4
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512F-NEXT: kandw %k5, %k4, %k4
-; AVX512F-NEXT: kshiftrw $11, %k3, %k3
-; AVX512F-NEXT: korw %k3, %k4, %k3
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512F-NEXT: kandw %k4, %k3, %k3
-; AVX512F-NEXT: kshiftrw $10, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k3, %k2
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512F-NEXT: kandw %k3, %k2, %k2
-; AVX512F-NEXT: kshiftlw $6, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kshiftlw $9, %k1, %k1
-; AVX512F-NEXT: kshiftrw $9, %k1, %k1
-; AVX512F-NEXT: kshiftlw $7, %k0, %k0
-; AVX512F-NEXT: korw %k0, %k1, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $9, %k1, %k1
-; AVX512F-NEXT: kshiftrw $9, %k1, %k1
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kxorw %k0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $14, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k7, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $13, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k6, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $12, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k5, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $11, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k4, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftrw $10, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kandw %k3, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $6, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftlw $9, %k0, %k0
-; AVX512F-NEXT: kshiftrw $9, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $7, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $9, %k1, %k1
-; AVX512F-NEXT: kshiftrw $9, %k1, %k1
+; AVX512F-NEXT: kmovw %eax, %k2
+; AVX512F-NEXT: kshiftlw $15, %k2, %k2
+; AVX512F-NEXT: korw %k2, %k0, %k2
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm4 {%k2} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: kshiftlw $7, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k1
-; AVX512F-NEXT: kxorw %k0, %k1, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 {%k2} {z} = -1
; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT: kxorw %k1, %k2, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $4, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kshiftrw $1, %k0, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %edx
-; AVX512F-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $31, %eax
-; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rax,4)
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 {%k1} {z} = -1
+; AVX512F-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm7
+; AVX512F-NEXT: vpaddd %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm7
+; AVX512F-NEXT: vpaddd %xmm7, %xmm4, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %eax
+; AVX512F-NEXT: vmovd %xmm4, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: addl %ecx, %esi
+; AVX512F-NEXT: andl $31, %esi
+; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rsi,4)
; AVX512F-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $31, %ecx
-; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rcx,4)
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
+; AVX512F-NEXT: vpsrld $31, %zmm5, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpextrd $1, %xmm1, %eax
+; AVX512F-NEXT: vmovd %xmm1, %ecx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm1, %eax
+; AVX512F-NEXT: vpextrd $3, %xmm1, %edx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
+; AVX512F-NEXT: andl $31, %edx
+; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rdx,4)
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
-; AVX512F-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
+; AVX512F-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpsrld $31, %zmm6, %zmm1
+; AVX512F-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512F-NEXT: vmovd %xmm0, %ecx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512F-NEXT: vpextrd $3, %xmm0, %edx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
; AVX512F-NEXT: andl $63, %edx
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
; AVX512F-NEXT: vmovaps %zmm0, 320(%rsp,%rdx,4)
-; AVX512F-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %zmm2, 384(%rsp,%rdx,4)
+; AVX512F-NEXT: vmovaps %zmm2, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %zmm1, 384(%rsp,%rdx,4)
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
@@ -4363,57 +4212,69 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: andq $-64, %rsp
; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VL-NEXT: kshiftrq $48, %k1, %k3
-; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
-; AVX512VL-NEXT: kshiftrq $16, %k1, %k2
-; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpmovb2m %zmm0, %k2
+; AVX512VL-NEXT: kshiftrq $48, %k2, %k1
+; AVX512VL-NEXT: kshiftrq $32, %k2, %k3
+; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
-; AVX512VL-NEXT: kshiftrq $8, %k1, %k0
-; AVX512VL-NEXT: kxorw %k0, %k1, %k0
-; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
-; AVX512VL-NEXT: kxorw %k5, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $2, %k0, %k5
-; AVX512VL-NEXT: kxorw %k5, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $1, %k0, %k5
-; AVX512VL-NEXT: kxorw %k5, %k0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpcompressd %zmm2, %zmm0 {%k2} {z}
-; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
-; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
-; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: kshiftrq $40, %k1, %k0
-; AVX512VL-NEXT: kxorw %k0, %k4, %k0
-; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
-; AVX512VL-NEXT: kxorw %k4, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $2, %k0, %k4
-; AVX512VL-NEXT: kxorw %k4, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $1, %k0, %k4
-; AVX512VL-NEXT: kxorw %k4, %k0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $31, %eax
-; AVX512VL-NEXT: vpcompressd %zmm4, %zmm0 {%k3} {z}
-; AVX512VL-NEXT: vmovdqa64 %zmm0, 128(%rsp,%rax,4)
-; AVX512VL-NEXT: vmovaps (%rsp), %zmm0
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
-; AVX512VL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: kxorw %k2, %k1, %k0
-; AVX512VL-NEXT: kshiftrw $8, %k0, %k1
-; AVX512VL-NEXT: kxorw %k1, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $4, %k0, %k1
-; AVX512VL-NEXT: kxorw %k1, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VL-NEXT: kxorw %k1, %k0, %k0
-; AVX512VL-NEXT: kshiftrw $1, %k0, %k1
-; AVX512VL-NEXT: kxorw %k1, %k0, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: andl $63, %eax
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512VL-NEXT: vpsrld $31, %zmm0, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512VL-NEXT: vpaddd %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512VL-NEXT: vpaddd %xmm5, %xmm1, %xmm1
+; AVX512VL-NEXT: vpextrd $1, %xmm1, %eax
+; AVX512VL-NEXT: vmovd %xmm1, %ecx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: vpextrd $2, %xmm1, %eax
+; AVX512VL-NEXT: vpextrd $3, %xmm1, %edx
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: addl %ecx, %edx
+; AVX512VL-NEXT: andl $31, %edx
+; AVX512VL-NEXT: kshiftrq $16, %k2, %k2
+; AVX512VL-NEXT: vpcompressd %zmm2, %zmm1 {%k2} {z}
+; AVX512VL-NEXT: vmovdqa64 %zmm1, (%rsp,%rdx,4)
+; AVX512VL-NEXT: vpcompressd %zmm3, %zmm1 {%k3} {z}
+; AVX512VL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
+; AVX512VL-NEXT: vpsrld $31, %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpextrd $1, %xmm1, %eax
+; AVX512VL-NEXT: vmovd %xmm1, %ecx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: vpextrd $2, %xmm1, %eax
+; AVX512VL-NEXT: vpextrd $3, %xmm1, %edx
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: addl %ecx, %edx
+; AVX512VL-NEXT: andl $31, %edx
+; AVX512VL-NEXT: vpcompressd %zmm4, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 %zmm1, 128(%rsp,%rdx,4)
+; AVX512VL-NEXT: vmovdqa64 (%rsp), %zmm1
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
-; AVX512VL-NEXT: vmovaps %zmm0, 256(%rsp,%rax,4)
-; AVX512VL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %zmm2, 320(%rsp,%rax,4)
+; AVX512VL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
+; AVX512VL-NEXT: vpsrld $31, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrd $1, %xmm0, %eax
+; AVX512VL-NEXT: vmovd %xmm0, %ecx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512VL-NEXT: vpextrd $3, %xmm0, %edx
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: addl %ecx, %edx
+; AVX512VL-NEXT: andl $63, %edx
+; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
+; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
+; AVX512VL-NEXT: vmovaps %zmm0, 256(%rsp,%rdx,4)
+; AVX512VL-NEXT: vmovaps %zmm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %zmm1, 320(%rsp,%rdx,4)
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
>From 84f5e703228f698ca14efbfca4d1a89576d74fe7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 10 Dec 2025 13:47:59 +0000
Subject: [PATCH 3/4] Fixup VT
---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1a82cdc2206e6..0d2443c0d1155 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2663,9 +2663,10 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
// We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
- SDValue WideMask =
- DAG.getNode(ISD::ZERO_EXTEND, DL,
- LoMask.getValueType().changeElementType(MVT::i32), LoMask);
+ EVT WideMaskVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoMask.getValueType().getVectorElementCount());
+ SDValue WideMask = DAG.getNode(ISD::ZERO_EXTEND, DL, WideMaskVT, LoMask);
SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
>From 2a91c8c0757a1d4756cfc6b7ddf8a44978509291 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 12 Dec 2025 14:34:50 +0000
Subject: [PATCH 4/4] Add assert
---
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 0d2443c0d1155..3572b2e5e4390 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2662,10 +2662,12 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(
MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+ EVT MaskVT = LoMask.getValueType();
+ assert(MaskVT.getScalarType() == MVT::i1 && "Expected vector of i1s");
+
// We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
- EVT WideMaskVT =
- EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- LoMask.getValueType().getVectorElementCount());
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ MaskVT.getVectorElementCount());
SDValue WideMask = DAG.getNode(ISD::ZERO_EXTEND, DL, WideMaskVT, LoMask);
SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
More information about the llvm-commits
mailing list